[clang] [llvm] [NVPTX] use untyped loads and stores wherever possible (PR #137698)

Alex MacLean via cfe-commits cfe-commits at lists.llvm.org
Wed Apr 30 13:45:03 PDT 2025


https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/137698

>From 472e16f17edd2ab310a7a149196ab37028e64317 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 24 Apr 2025 15:31:19 +0000
Subject: [PATCH 1/2] [NVPTX] Remove load/store type

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp   |   67 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |   30 +-
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |   66 +-
 .../test/CodeGen/NVPTX/LoadStoreVectorizer.ll |  174 +-
 llvm/test/CodeGen/NVPTX/MachineSink-call.ll   |    2 +-
 .../CodeGen/NVPTX/MachineSink-convergent.ll   |    2 +-
 llvm/test/CodeGen/NVPTX/access-non-generic.ll |   24 +-
 llvm/test/CodeGen/NVPTX/addr-mode.ll          |   20 +-
 .../CodeGen/NVPTX/addrspacecast-folding.ll    |    4 +-
 .../test/CodeGen/NVPTX/addrspacecast-ptx64.ll |   32 +-
 llvm/test/CodeGen/NVPTX/addrspacecast.ll      |   38 +-
 llvm/test/CodeGen/NVPTX/aggregate-return.ll   |   30 +-
 llvm/test/CodeGen/NVPTX/and-or-setcc.ll       |    8 +-
 llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll |    4 +-
 llvm/test/CodeGen/NVPTX/applypriority.ll      |    4 +-
 llvm/test/CodeGen/NVPTX/atomics-sm70.ll       |   26 +-
 llvm/test/CodeGen/NVPTX/atomics-sm90.ll       |   26 +-
 llvm/test/CodeGen/NVPTX/atomics.ll            |  132 +-
 llvm/test/CodeGen/NVPTX/barrier.ll            |    4 +-
 llvm/test/CodeGen/NVPTX/bf16-instructions.ll  |  114 +-
 llvm/test/CodeGen/NVPTX/bf16.ll               |    4 +-
 .../test/CodeGen/NVPTX/bf16x2-instructions.ll |   40 +-
 llvm/test/CodeGen/NVPTX/bfe.ll                |   28 +-
 llvm/test/CodeGen/NVPTX/bswap.ll              |    8 +-
 llvm/test/CodeGen/NVPTX/bug21465.ll           |    6 +-
 llvm/test/CodeGen/NVPTX/bug22246.ll           |   10 +-
 llvm/test/CodeGen/NVPTX/bug26185-2.ll         |    2 +-
 llvm/test/CodeGen/NVPTX/bug26185.ll           |    8 +-
 .../CodeGen/NVPTX/call-with-alloca-buffer.ll  |    6 +-
 llvm/test/CodeGen/NVPTX/chain-different-as.ll |    4 +-
 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll       | 1260 +++++------
 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll       | 1260 +++++------
 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll       | 1260 +++++------
 llvm/test/CodeGen/NVPTX/cmpxchg.ll            |  420 ++--
 llvm/test/CodeGen/NVPTX/combine-mad.ll        |   56 +-
 llvm/test/CodeGen/NVPTX/convert-fp-i8.ll      |   20 +-
 llvm/test/CodeGen/NVPTX/convert-int-sm20.ll   |   12 +-
 llvm/test/CodeGen/NVPTX/convert-sm100.ll      |    8 +-
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll     |   42 +-
 llvm/test/CodeGen/NVPTX/convert-sm80.ll       |   50 +-
 llvm/test/CodeGen/NVPTX/convert-sm90.ll       |    8 +-
 llvm/test/CodeGen/NVPTX/copysign.ll           |   36 +-
 .../CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll |  292 +--
 .../NVPTX/cp-async-bulk-tensor-prefetch.ll    |   62 +-
 .../NVPTX/cp-async-bulk-tensor-reduce.ll      |  102 +-
 .../CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll |  120 +-
 llvm/test/CodeGen/NVPTX/cp-async-bulk.ll      |   62 +-
 llvm/test/CodeGen/NVPTX/ctlz.ll               |   28 +-
 llvm/test/CodeGen/NVPTX/dag-cse.ll            |    8 +-
 llvm/test/CodeGen/NVPTX/demote-vars.ll        |    4 +-
 llvm/test/CodeGen/NVPTX/discard.ll            |    4 +-
 llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll   |    2 +-
 .../NVPTX/distributed-shared-cluster.ll       |   26 +-
 llvm/test/CodeGen/NVPTX/div.ll                |    6 +-
 llvm/test/CodeGen/NVPTX/dot-product.ll        |   74 +-
 .../NVPTX/dynamic-stackalloc-regression.ll    |   10 +-
 llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll |    4 +-
 llvm/test/CodeGen/NVPTX/elect.ll              |    4 +-
 llvm/test/CodeGen/NVPTX/extloadv.ll           |    2 +-
 llvm/test/CodeGen/NVPTX/extractelement.ll     |   14 +-
 llvm/test/CodeGen/NVPTX/f16-instructions.ll   |   64 +-
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll |   90 +-
 llvm/test/CodeGen/NVPTX/f32-ex2.ll            |    8 +-
 llvm/test/CodeGen/NVPTX/f32-lg2.ll            |    8 +-
 llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll    |   12 +-
 llvm/test/CodeGen/NVPTX/fexp2.ll              |   40 +-
 llvm/test/CodeGen/NVPTX/flo.ll                |   16 +-
 llvm/test/CodeGen/NVPTX/flog2.ll              |   16 +-
 llvm/test/CodeGen/NVPTX/fma-relu-contract.ll  |   24 +-
 .../CodeGen/NVPTX/fma-relu-fma-intrinsic.ll   |   18 +-
 .../NVPTX/fma-relu-instruction-flag.ll        |   36 +-
 llvm/test/CodeGen/NVPTX/fns.ll                |    6 +-
 llvm/test/CodeGen/NVPTX/forward-ld-param.ll   |   18 +-
 llvm/test/CodeGen/NVPTX/fp-contract.ll        |   42 +-
 llvm/test/CodeGen/NVPTX/fp128-storage-type.ll |   12 +-
 llvm/test/CodeGen/NVPTX/frem.ll               |   64 +-
 llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll |   20 +-
 llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll    |    6 +-
 llvm/test/CodeGen/NVPTX/globals_lowering.ll   |    4 +-
 llvm/test/CodeGen/NVPTX/half.ll               |    4 +-
 llvm/test/CodeGen/NVPTX/i1-ext-load.ll        |    8 +-
 llvm/test/CodeGen/NVPTX/i1-icmp.ll            |   40 +-
 llvm/test/CodeGen/NVPTX/i1-load-lower.ll      |    4 +-
 llvm/test/CodeGen/NVPTX/i1-select.ll          |   40 +-
 llvm/test/CodeGen/NVPTX/i128-array.ll         |   12 +-
 llvm/test/CodeGen/NVPTX/i128-param.ll         |   12 +-
 llvm/test/CodeGen/NVPTX/i128-retval.ll        |    8 +-
 llvm/test/CodeGen/NVPTX/i128.ll               |   28 +-
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll |  204 +-
 llvm/test/CodeGen/NVPTX/i8-param.ll           |    4 +-
 llvm/test/CodeGen/NVPTX/i8x2-instructions.ll  |    4 +-
 llvm/test/CodeGen/NVPTX/i8x4-instructions.ll  |  258 +--
 llvm/test/CodeGen/NVPTX/idioms.ll             |    8 +-
 llvm/test/CodeGen/NVPTX/indirect_byval.ll     |    8 +-
 .../CodeGen/NVPTX/inline-asm-b128-test1.ll    |   16 +-
 .../CodeGen/NVPTX/inline-asm-b128-test2.ll    |    8 +-
 .../CodeGen/NVPTX/inline-asm-b128-test3.ll    |    4 +-
 llvm/test/CodeGen/NVPTX/intrinsics.ll         |   40 +-
 llvm/test/CodeGen/NVPTX/jump-table.ll         |   12 +-
 llvm/test/CodeGen/NVPTX/ld-addrspace.ll       |   72 +-
 llvm/test/CodeGen/NVPTX/ld-generic.ll         |   24 +-
 llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py   |   17 +-
 llvm/test/CodeGen/NVPTX/ldg-invariant.ll      |   40 +-
 llvm/test/CodeGen/NVPTX/ldparam-v4.ll         |    6 +-
 llvm/test/CodeGen/NVPTX/ldu-i8.ll             |    2 +-
 llvm/test/CodeGen/NVPTX/ldu-ldg.ll            |   84 +-
 .../test/CodeGen/NVPTX/ldu-reg-plus-offset.ll |    4 +-
 llvm/test/CodeGen/NVPTX/load-sext-i1.ll       |    4 +-
 llvm/test/CodeGen/NVPTX/load-store-scalars.ll | 1152 +++++-----
 llvm/test/CodeGen/NVPTX/load-store-sm-70.ll   | 1920 ++++++++---------
 llvm/test/CodeGen/NVPTX/load-store-sm-90.ll   |  768 +++----
 llvm/test/CodeGen/NVPTX/load-store-vectors.ll |  528 ++---
 .../NVPTX/load-with-non-coherent-cache.ll     |   88 +-
 llvm/test/CodeGen/NVPTX/local-stack-frame.ll  |   32 +-
 llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll  |   22 +-
 llvm/test/CodeGen/NVPTX/lower-alloca.ll       |    4 +-
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  |   80 +-
 llvm/test/CodeGen/NVPTX/lower-args.ll         |   58 +-
 llvm/test/CodeGen/NVPTX/lower-byval-args.ll   |  308 +--
 .../CodeGen/NVPTX/lower-kernel-ptr-arg.ll     |   24 +-
 llvm/test/CodeGen/NVPTX/machine-sink.ll       |    4 +-
 llvm/test/CodeGen/NVPTX/match.ll              |   16 +-
 llvm/test/CodeGen/NVPTX/math-intrins.ll       |  304 +--
 .../CodeGen/NVPTX/misaligned-vector-ldst.ll   |   80 +-
 llvm/test/CodeGen/NVPTX/misched_func_call.ll  |   12 +-
 llvm/test/CodeGen/NVPTX/mulhi-intrins.ll      |   24 +-
 llvm/test/CodeGen/NVPTX/nounroll.ll           |   16 +-
 .../CodeGen/NVPTX/nvvm-reflect-arch-O0.ll     |    6 +-
 llvm/test/CodeGen/NVPTX/param-add.ll          |    8 +-
 llvm/test/CodeGen/NVPTX/param-align.ll        |   30 +-
 llvm/test/CodeGen/NVPTX/param-load-store.ll   |  236 +-
 llvm/test/CodeGen/NVPTX/param-overalign.ll    |   28 +-
 .../CodeGen/NVPTX/param-vectorize-device.ll   |   50 +-
 .../CodeGen/NVPTX/param-vectorize-kernel.ll   |  208 +-
 llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll   |    8 +-
 llvm/test/CodeGen/NVPTX/pr16278.ll            |    2 +-
 llvm/test/CodeGen/NVPTX/prefetch.ll           |    8 +-
 .../CodeGen/NVPTX/proxy-reg-erasure-ptx.ll    |   18 +-
 llvm/test/CodeGen/NVPTX/rcp-opt.ll            |   12 +-
 .../NVPTX/read-global-variable-constant.ll    |    6 +-
 .../CodeGen/NVPTX/reduction-intrinsics.ll     |  356 +--
 llvm/test/CodeGen/NVPTX/redux-sync-f32.ll     |   48 +-
 llvm/test/CodeGen/NVPTX/reg-types.ll          |   20 +-
 llvm/test/CodeGen/NVPTX/rotate-add.ll         |   40 +-
 llvm/test/CodeGen/NVPTX/rotate.ll             |  164 +-
 llvm/test/CodeGen/NVPTX/rotate_64.ll          |    4 +-
 llvm/test/CodeGen/NVPTX/sad-intrins.ll        |   36 +-
 llvm/test/CodeGen/NVPTX/sched1.ll             |    8 +-
 llvm/test/CodeGen/NVPTX/sched2.ll             |    8 +-
 llvm/test/CodeGen/NVPTX/sext-params.ll        |    2 +-
 llvm/test/CodeGen/NVPTX/sext-setcc.ll         |    8 +-
 llvm/test/CodeGen/NVPTX/shfl-p.ll             |   64 +-
 llvm/test/CodeGen/NVPTX/shfl-sync-p.ll        |   80 +-
 llvm/test/CodeGen/NVPTX/shfl-sync.ll          |   40 +-
 llvm/test/CodeGen/NVPTX/shfl.ll               |   18 +-
 llvm/test/CodeGen/NVPTX/short-ptr.ll          |    6 +-
 .../CodeGen/NVPTX/shuffle-vec-undef-init.ll   |    6 +-
 llvm/test/CodeGen/NVPTX/st-addrspace.ll       |   72 +-
 llvm/test/CodeGen/NVPTX/st-generic.ll         |   24 +-
 llvm/test/CodeGen/NVPTX/st-param-imm.ll       |  294 +--
 llvm/test/CodeGen/NVPTX/st_bulk.ll            |   12 +-
 llvm/test/CodeGen/NVPTX/stacksaverestore.ll   |    6 +-
 llvm/test/CodeGen/NVPTX/store-retval.ll       |    6 +-
 llvm/test/CodeGen/NVPTX/store-undef.ll        |   56 +-
 llvm/test/CodeGen/NVPTX/surf-read-cuda.ll     |   14 +-
 llvm/test/CodeGen/NVPTX/surf-read.ll          |    2 +-
 llvm/test/CodeGen/NVPTX/surf-write-cuda.ll    |   10 +-
 llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll      |   24 +-
 llvm/test/CodeGen/NVPTX/tcgen05-commit.ll     |   24 +-
 llvm/test/CodeGen/NVPTX/tcgen05-cp.ll         |   72 +-
 llvm/test/CodeGen/NVPTX/tcgen05-ld.ll         |   20 +-
 llvm/test/CodeGen/NVPTX/tcgen05-shift.ll      |    2 +-
 llvm/test/CodeGen/NVPTX/tcgen05-st.ll         | 1308 +++++------
 llvm/test/CodeGen/NVPTX/tex-read-cuda.ll      |   22 +-
 llvm/test/CodeGen/NVPTX/tex-read.ll           |    2 +-
 llvm/test/CodeGen/NVPTX/texsurf-queries.ll    |    8 +-
 .../NVPTX/unaligned-param-load-store.ll       |   86 +-
 ...unfold-masked-merge-vector-variablemask.ll |  174 +-
 llvm/test/CodeGen/NVPTX/vaargs.ll             |   42 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll  |   92 +-
 llvm/test/CodeGen/NVPTX/vec-param-load.ll     |   52 +-
 llvm/test/CodeGen/NVPTX/vec8.ll               |    6 +-
 llvm/test/CodeGen/NVPTX/vector-args.ll        |    8 +-
 llvm/test/CodeGen/NVPTX/vector-call.ll        |    6 +-
 llvm/test/CodeGen/NVPTX/vector-compare.ll     |    8 +-
 llvm/test/CodeGen/NVPTX/vector-loads.ll       |   34 +-
 llvm/test/CodeGen/NVPTX/vector-select.ll      |    8 +-
 llvm/test/CodeGen/NVPTX/vector-stores.ll      |    8 +-
 .../CodeGen/NVPTX/vectorize-misaligned.ll     |    8 +-
 189 files changed, 8554 insertions(+), 8582 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 295ed666a1902..e81448ff227be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1077,21 +1077,6 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
   }
 }
 
-static int getLdStRegType(EVT VT) {
-  if (VT.isFloatingPoint())
-    switch (VT.getSimpleVT().SimpleTy) {
-    case MVT::f16:
-    case MVT::bf16:
-    case MVT::v2f16:
-    case MVT::v2bf16:
-      return NVPTX::PTXLdStInstCode::Untyped;
-    default:
-      return NVPTX::PTXLdStInstCode::Float;
-    }
-  else
-    return NVPTX::PTXLdStInstCode::Unsigned;
-}
-
 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   MemSDNode *LD = cast<MemSDNode>(N);
   assert(LD->readMem() && "Expected load");
@@ -1122,24 +1107,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   //          type is integer
   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
   MVT SimpleVT = LoadedVT.getSimpleVT();
-  MVT ScalarVT = SimpleVT.getScalarType();
   // Read at least 8 bits (predicates are stored as 8-bit values)
-  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
-  unsigned int FromType;
+  unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits());
 
   // Vector Setting
-  unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
-  if (SimpleVT.isVector()) {
-    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
-           "Unexpected vector type");
-    // v2f16/v2bf16/v2i16 is loaded using ld.b32
-    FromTypeWidth = 32;
-  }
-
-  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
-    FromType = NVPTX::PTXLdStInstCode::Signed;
-  else
-    FromType = getLdStRegType(ScalarVT);
+  unsigned int FromType =
+      (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
+          ? NVPTX::PTXLdStInstCode::Signed
+          : NVPTX::PTXLdStInstCode::Untyped;
 
   // Create the machine instruction DAG
   SDValue Offset, Base;
@@ -1147,7 +1122,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   SDValue Ops[] = {getI32Imm(Ordering, DL),
                    getI32Imm(Scope, DL),
                    getI32Imm(CodeAddrSpace, DL),
-                   getI32Imm(VecType, DL),
+                   getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
                    getI32Imm(FromType, DL),
                    getI32Imm(FromTypeWidth, DL),
                    Base,
@@ -1214,7 +1189,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1);
   unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
                           ? NVPTX::PTXLdStInstCode::Signed
-                          : getLdStRegType(MemVT.getScalarType());
+                          : NVPTX::PTXLdStInstCode::Untyped;
 
   unsigned VecType;
   unsigned FromTypeWidth;
@@ -1232,8 +1207,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   }
 
   if (isSubVectorPackedInI32(EltVT)) {
+    assert(ExtensionType == ISD::NON_EXTLOAD);
     EltVT = MVT::i32;
-    FromType = NVPTX::PTXLdStInstCode::Untyped;
   }
 
   SDValue Offset, Base;
@@ -1434,21 +1409,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Vector Setting
-  MVT SimpleVT = StoreVT.getSimpleVT();
-  unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
-
-  // Type Setting: toType + toTypeWidth
-  // - for integer type, always use 'u'
-  MVT ScalarVT = SimpleVT.getScalarType();
-  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
-  if (SimpleVT.isVector()) {
-    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
-           "Unexpected vector type");
-    // v2x16 is stored using st.b32
-    ToTypeWidth = 32;
-  }
-
-  unsigned int ToType = getLdStRegType(ScalarVT);
+  const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
 
   // Create the machine instruction DAG
   SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
@@ -1460,8 +1421,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
                    getI32Imm(Ordering, DL),
                    getI32Imm(Scope, DL),
                    getI32Imm(CodeAddrSpace, DL),
-                   getI32Imm(VecType, DL),
-                   getI32Imm(ToType, DL),
+                   getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
+                   getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
                    getI32Imm(ToTypeWidth, DL),
                    Base,
                    Offset,
@@ -1507,7 +1468,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   // Type Setting: toType + toTypeWidth
   // - for integer type, always use 'u'
   const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
-  unsigned ToType = getLdStRegType(StoreVT.getSimpleVT().getScalarType());
 
   SmallVector<SDValue, 12> Ops;
   SDValue N2;
@@ -1534,7 +1494,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
 
   if (isSubVectorPackedInI32(EltVT)) {
     EltVT = MVT::i32;
-    ToType = NVPTX::PTXLdStInstCode::Untyped;
   }
 
   SDValue Offset, Base;
@@ -1542,8 +1501,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
 
   Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
               getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
-              getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL), Base, Offset,
-              Chain});
+              getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
+              getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
 
   std::optional<unsigned> Opcode;
   switch (N->getOpcode()) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 043da14bcb236..21846583a8c04 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2296,11 +2296,11 @@ def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
 def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
 def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
 def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
-def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
-def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
-def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
-def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
-def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
+def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".b32">;
+def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".b64">;
+def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".b32">;
+def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".b64">;
+def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".b32">;
 
 defm StoreParamI64    : StoreParamInst<Int64Regs, i64imm, ".b64">;
 defm StoreParamI32    : StoreParamInst<Int32Regs, i32imm, ".b32">;
@@ -2319,13 +2319,13 @@ defm StoreParamV4I32  : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
 defm StoreParamV4I16  : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
 defm StoreParamV4I8   : StoreParamV4Inst<Int16Regs, i8imm,  ".b8">;
 
-defm StoreParamF32    : StoreParamInst<Float32Regs, f32imm, ".f32">;
-defm StoreParamF64    : StoreParamInst<Float64Regs, f64imm, ".f64">;
+defm StoreParamF32    : StoreParamInst<Float32Regs, f32imm, ".b32">;
+defm StoreParamF64    : StoreParamInst<Float64Regs, f64imm, ".b64">;
 
-defm StoreParamV2F32  : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
-defm StoreParamV2F64  : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;
+defm StoreParamV2F32  : StoreParamV2Inst<Float32Regs, f32imm, ".b32">;
+defm StoreParamV2F64  : StoreParamV2Inst<Float64Regs, f64imm, ".b64">;
 
-defm StoreParamV4F32  : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
+defm StoreParamV4F32  : StoreParamV4Inst<Float32Regs, f32imm, ".b32">;
 
 def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
 def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
@@ -2341,11 +2341,11 @@ def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
 def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
 def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
 
-def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
-def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
-def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
-def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
-def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
+def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".b64">;
+def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".b32">;
+def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".b64">;
+def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".b32">;
+def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".b32">;
 
 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
 def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 3eedb43e4c81a..4d56cf38531e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2302,12 +2302,12 @@ class LDU_G<string TyStr, NVPTXRegClass regclass>
                "ldu.global." # TyStr # " \t$result, [$src];",
                       []>, Requires<[hasLDU]>;
 
-def INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
-def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
-def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
-def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
+def INT_PTX_LDU_GLOBAL_i8  : LDU_G<"b8", Int16Regs>;
+def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
+def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
+def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
+def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
+def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
 
 // vector
 
@@ -2324,19 +2324,19 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
                "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
 
-def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
-def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
-def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
-def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
-def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
-def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
+def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
+def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
+def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
+def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
+def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
+def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
 
-def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
-def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
-def INT_PTX_LDU_G_v4i32_ELE  : VLDU_G_ELE_V4<"u32", Int32Regs>;
+def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
+def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
+def INT_PTX_LDU_G_v4i32_ELE  : VLDU_G_ELE_V4<"b32", Int32Regs>;
 def INT_PTX_LDU_G_v4f16_ELE   : VLDU_G_ELE_V4<"b16", Int16Regs>;
 def INT_PTX_LDU_G_v4f16x2_ELE  : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f32_ELE  : VLDU_G_ELE_V4<"f32", Float32Regs>;
+def INT_PTX_LDU_G_v4f32_ELE  : VLDU_G_ELE_V4<"b32", Float32Regs>;
 
 
 //-----------------------------------
@@ -2352,12 +2352,12 @@ class LDG_G<string TyStr, NVPTXRegClass regclass>
                "ld.global.nc." # TyStr # " \t$result, [$src];",
                         []>, Requires<[hasLDG]>;
 
-def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
-def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
-def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
-def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
+def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
+def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
+def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
+def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
+def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
+def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
 
 // vector
 
@@ -2374,17 +2374,17 @@ class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
             "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
 // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
-def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
-def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
-def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
-def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
-def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
-
-def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
-def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
-def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
-def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
+def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
+def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
+def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
+def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
+def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
+def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
+
+def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
+def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
+def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
+def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
 
 
 multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 8f0964c2d5eba..78b57badc06e8 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -16,8 +16,8 @@ define i32 @f(ptr %p) {
 ; ENABLED-NEXT:    .reg .b64 %rd<2>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
-; ENABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
-; ENABLED-NEXT:    ld.v2.u32 {%r1, %r2}, [%rd1];
+; ENABLED-NEXT:    ld.param.b64 %rd1, [f_param_0];
+; ENABLED-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
 ; ENABLED-NEXT:    add.s32 %r3, %r1, %r2;
 ; ENABLED-NEXT:    st.param.b32 [func_retval0], %r3;
 ; ENABLED-NEXT:    ret;
@@ -28,9 +28,9 @@ define i32 @f(ptr %p) {
 ; DISABLED-NEXT:    .reg .b64 %rd<2>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
-; DISABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
-; DISABLED-NEXT:    ld.u32 %r1, [%rd1];
-; DISABLED-NEXT:    ld.u32 %r2, [%rd1+4];
+; DISABLED-NEXT:    ld.param.b64 %rd1, [f_param_0];
+; DISABLED-NEXT:    ld.b32 %r1, [%rd1];
+; DISABLED-NEXT:    ld.b32 %r2, [%rd1+4];
 ; DISABLED-NEXT:    add.s32 %r3, %r1, %r2;
 ; DISABLED-NEXT:    st.param.b32 [func_retval0], %r3;
 ; DISABLED-NEXT:    ret;
@@ -49,7 +49,7 @@ define half @fh(ptr %p) {
 ; ENABLED-NEXT:    .reg .b64 %rd<2>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
-; ENABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
+; ENABLED-NEXT:    ld.param.b64 %rd1, [fh_param_0];
 ; ENABLED-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; ENABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
 ; ENABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
@@ -78,7 +78,7 @@ define half @fh(ptr %p) {
 ; DISABLED-NEXT:    .reg .b64 %rd<2>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
-; DISABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
+; DISABLED-NEXT:    ld.param.b64 %rd1, [fh_param_0];
 ; DISABLED-NEXT:    ld.b16 %rs1, [%rd1];
 ; DISABLED-NEXT:    ld.b16 %rs2, [%rd1+2];
 ; DISABLED-NEXT:    ld.b16 %rs3, [%rd1+4];
@@ -125,14 +125,14 @@ define float @ff(ptr %p) {
 ; ENABLED-NEXT:    .reg .b64 %rd<2>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
-; ENABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
-; ENABLED-NEXT:    ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; ENABLED-NEXT:    ld.f32 %f5, [%rd1+16];
+; ENABLED-NEXT:    ld.param.b64 %rd1, [ff_param_0];
+; ENABLED-NEXT:    ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
+; ENABLED-NEXT:    ld.b32 %f5, [%rd1+16];
 ; ENABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
 ; ENABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
 ; ENABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
 ; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
-; ENABLED-NEXT:    st.param.f32 [func_retval0], %f9;
+; ENABLED-NEXT:    st.param.b32 [func_retval0], %f9;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: ff(
@@ -141,17 +141,17 @@ define float @ff(ptr %p) {
 ; DISABLED-NEXT:    .reg .b64 %rd<2>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
-; DISABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
-; DISABLED-NEXT:    ld.f32 %f1, [%rd1];
-; DISABLED-NEXT:    ld.f32 %f2, [%rd1+4];
-; DISABLED-NEXT:    ld.f32 %f3, [%rd1+8];
-; DISABLED-NEXT:    ld.f32 %f4, [%rd1+12];
-; DISABLED-NEXT:    ld.f32 %f5, [%rd1+16];
+; DISABLED-NEXT:    ld.param.b64 %rd1, [ff_param_0];
+; DISABLED-NEXT:    ld.b32 %f1, [%rd1];
+; DISABLED-NEXT:    ld.b32 %f2, [%rd1+4];
+; DISABLED-NEXT:    ld.b32 %f3, [%rd1+8];
+; DISABLED-NEXT:    ld.b32 %f4, [%rd1+12];
+; DISABLED-NEXT:    ld.b32 %f5, [%rd1+16];
 ; DISABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
 ; DISABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
 ; DISABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
 ; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
-; DISABLED-NEXT:    st.param.f32 [func_retval0], %f9;
+; DISABLED-NEXT:    st.param.b32 [func_retval0], %f9;
 ; DISABLED-NEXT:    ret;
   %p.1 = getelementptr float, ptr %p, i32 1
   %p.2 = getelementptr float, ptr %p, i32 2
@@ -176,9 +176,9 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT:    .reg .b64 %rd<3>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
-; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
+; ENABLED-NEXT:    ld.param.b64 %rd1, [combine_v16i8_param_0];
 ; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
+; ENABLED-NEXT:    ld.param.b64 %rd2, [combine_v16i8_param_1];
 ; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
 ; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
 ; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
@@ -210,7 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
 ; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
 ; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
-; ENABLED-NEXT:    st.u32 [%rd2], %r35;
+; ENABLED-NEXT:    st.b32 [%rd2], %r35;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: combine_v16i8(
@@ -219,24 +219,24 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; DISABLED-NEXT:    .reg .b64 %rd<3>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
-; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
-; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
-; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
-; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
-; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
-; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
-; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
-; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
-; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
-; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
-; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
-; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
-; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
-; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
-; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
-; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
-; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
-; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
+; DISABLED-NEXT:    ld.param.b64 %rd1, [combine_v16i8_param_0];
+; DISABLED-NEXT:    ld.b8 %r1, [%rd1];
+; DISABLED-NEXT:    ld.param.b64 %rd2, [combine_v16i8_param_1];
+; DISABLED-NEXT:    ld.b8 %r2, [%rd1+1];
+; DISABLED-NEXT:    ld.b8 %r3, [%rd1+2];
+; DISABLED-NEXT:    ld.b8 %r4, [%rd1+3];
+; DISABLED-NEXT:    ld.b8 %r5, [%rd1+4];
+; DISABLED-NEXT:    ld.b8 %r6, [%rd1+5];
+; DISABLED-NEXT:    ld.b8 %r7, [%rd1+6];
+; DISABLED-NEXT:    ld.b8 %r8, [%rd1+7];
+; DISABLED-NEXT:    ld.b8 %r9, [%rd1+8];
+; DISABLED-NEXT:    ld.b8 %r10, [%rd1+9];
+; DISABLED-NEXT:    ld.b8 %r11, [%rd1+10];
+; DISABLED-NEXT:    ld.b8 %r12, [%rd1+11];
+; DISABLED-NEXT:    ld.b8 %r13, [%rd1+12];
+; DISABLED-NEXT:    ld.b8 %r14, [%rd1+13];
+; DISABLED-NEXT:    ld.b8 %r15, [%rd1+14];
+; DISABLED-NEXT:    ld.b8 %r16, [%rd1+15];
 ; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
 ; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
 ; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
@@ -252,7 +252,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
 ; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
 ; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
-; DISABLED-NEXT:    st.u32 [%rd2], %r31;
+; DISABLED-NEXT:    st.b32 [%rd2], %r31;
 ; DISABLED-NEXT:    ret;
   %val0 = load i8, ptr %ptr1, align 16
   %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
@@ -327,9 +327,9 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
 ; ENABLED-NEXT:    .reg .b64 %rd<3>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
-; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
+; ENABLED-NEXT:    ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0];
 ; ENABLED-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
-; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
+; ENABLED-NEXT:    ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
 ; ENABLED-NEXT:    ld.v2.b32 {%r3, %r4}, [%rd1+8];
 ; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
 ; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
@@ -362,7 +362,7 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
 ; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
 ; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
 ; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
-; ENABLED-NEXT:    st.u32 [%rd2], %r35;
+; ENABLED-NEXT:    st.b32 [%rd2], %r35;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: combine_v16i8_unaligned(
@@ -371,24 +371,24 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
 ; DISABLED-NEXT:    .reg .b64 %rd<3>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
-; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
-; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
-; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
-; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
-; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
-; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
-; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
-; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
-; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
-; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
-; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
-; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
-; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
-; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
-; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
-; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
-; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
-; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
+; DISABLED-NEXT:    ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0];
+; DISABLED-NEXT:    ld.b8 %r1, [%rd1];
+; DISABLED-NEXT:    ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
+; DISABLED-NEXT:    ld.b8 %r2, [%rd1+1];
+; DISABLED-NEXT:    ld.b8 %r3, [%rd1+2];
+; DISABLED-NEXT:    ld.b8 %r4, [%rd1+3];
+; DISABLED-NEXT:    ld.b8 %r5, [%rd1+4];
+; DISABLED-NEXT:    ld.b8 %r6, [%rd1+5];
+; DISABLED-NEXT:    ld.b8 %r7, [%rd1+6];
+; DISABLED-NEXT:    ld.b8 %r8, [%rd1+7];
+; DISABLED-NEXT:    ld.b8 %r9, [%rd1+8];
+; DISABLED-NEXT:    ld.b8 %r10, [%rd1+9];
+; DISABLED-NEXT:    ld.b8 %r11, [%rd1+10];
+; DISABLED-NEXT:    ld.b8 %r12, [%rd1+11];
+; DISABLED-NEXT:    ld.b8 %r13, [%rd1+12];
+; DISABLED-NEXT:    ld.b8 %r14, [%rd1+13];
+; DISABLED-NEXT:    ld.b8 %r15, [%rd1+14];
+; DISABLED-NEXT:    ld.b8 %r16, [%rd1+15];
 ; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
 ; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
 ; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
@@ -404,7 +404,7 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
 ; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
 ; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
 ; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
-; DISABLED-NEXT:    st.u32 [%rd2], %r31;
+; DISABLED-NEXT:    st.b32 [%rd2], %r31;
 ; DISABLED-NEXT:    ret;
   %val0 = load i8, ptr %ptr1, align 8
   %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
@@ -481,13 +481,13 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT:    .reg .b64 %rd<3>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
-; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
+; ENABLED-NEXT:    ld.param.b64 %rd1, [combine_v8i16_param_0];
 ; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; ENABLED-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; ENABLED-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
 ; ENABLED-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
 ; ENABLED-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
-; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
+; ENABLED-NEXT:    ld.param.b64 %rd2, [combine_v8i16_param_1];
 ; ENABLED-NEXT:    cvt.u32.u16 %r5, %rs7;
 ; ENABLED-NEXT:    cvt.u32.u16 %r6, %rs8;
 ; ENABLED-NEXT:    cvt.u32.u16 %r7, %rs5;
@@ -503,7 +503,7 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT:    add.s32 %r17, %r16, %r10;
 ; ENABLED-NEXT:    add.s32 %r18, %r17, %r11;
 ; ENABLED-NEXT:    add.s32 %r19, %r18, %r12;
-; ENABLED-NEXT:    st.u32 [%rd2], %r19;
+; ENABLED-NEXT:    st.b32 [%rd2], %r19;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: combine_v8i16(
@@ -512,16 +512,16 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; DISABLED-NEXT:    .reg .b64 %rd<3>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
-; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
-; DISABLED-NEXT:    ld.u16 %r1, [%rd1];
-; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
-; DISABLED-NEXT:    ld.u16 %r2, [%rd1+2];
-; DISABLED-NEXT:    ld.u16 %r3, [%rd1+4];
-; DISABLED-NEXT:    ld.u16 %r4, [%rd1+6];
-; DISABLED-NEXT:    ld.u16 %r5, [%rd1+8];
-; DISABLED-NEXT:    ld.u16 %r6, [%rd1+10];
-; DISABLED-NEXT:    ld.u16 %r7, [%rd1+12];
-; DISABLED-NEXT:    ld.u16 %r8, [%rd1+14];
+; DISABLED-NEXT:    ld.param.b64 %rd1, [combine_v8i16_param_0];
+; DISABLED-NEXT:    ld.b16 %r1, [%rd1];
+; DISABLED-NEXT:    ld.param.b64 %rd2, [combine_v8i16_param_1];
+; DISABLED-NEXT:    ld.b16 %r2, [%rd1+2];
+; DISABLED-NEXT:    ld.b16 %r3, [%rd1+4];
+; DISABLED-NEXT:    ld.b16 %r4, [%rd1+6];
+; DISABLED-NEXT:    ld.b16 %r5, [%rd1+8];
+; DISABLED-NEXT:    ld.b16 %r6, [%rd1+10];
+; DISABLED-NEXT:    ld.b16 %r7, [%rd1+12];
+; DISABLED-NEXT:    ld.b16 %r8, [%rd1+14];
 ; DISABLED-NEXT:    add.s32 %r9, %r1, %r2;
 ; DISABLED-NEXT:    add.s32 %r10, %r9, %r3;
 ; DISABLED-NEXT:    add.s32 %r11, %r10, %r4;
@@ -529,7 +529,7 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; DISABLED-NEXT:    add.s32 %r13, %r12, %r6;
 ; DISABLED-NEXT:    add.s32 %r14, %r13, %r7;
 ; DISABLED-NEXT:    add.s32 %r15, %r14, %r8;
-; DISABLED-NEXT:    st.u32 [%rd2], %r15;
+; DISABLED-NEXT:    st.b32 [%rd2], %r15;
 ; DISABLED-NEXT:    ret;
   %val0 = load i16, ptr %ptr1, align 16
   %ptr1.1 = getelementptr inbounds i16, ptr %ptr1, i64 1
@@ -572,13 +572,13 @@ define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT:    .reg .b64 %rd<3>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
-; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v4i32_param_0];
-; ENABLED-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
-; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v4i32_param_1];
+; ENABLED-NEXT:    ld.param.b64 %rd1, [combine_v4i32_param_0];
+; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT:    ld.param.b64 %rd2, [combine_v4i32_param_1];
 ; ENABLED-NEXT:    add.s32 %r5, %r1, %r2;
 ; ENABLED-NEXT:    add.s32 %r6, %r5, %r3;
 ; ENABLED-NEXT:    add.s32 %r7, %r6, %r4;
-; ENABLED-NEXT:    st.u32 [%rd2], %r7;
+; ENABLED-NEXT:    st.b32 [%rd2], %r7;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: combine_v4i32(
@@ -587,16 +587,16 @@ define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; DISABLED-NEXT:    .reg .b64 %rd<3>;
 ; DISABLED-EMPTY:
 ; DISABLED-NEXT:  // %bb.0:
-; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v4i32_param_0];
-; DISABLED-NEXT:    ld.u32 %r1, [%rd1];
-; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v4i32_param_1];
-; DISABLED-NEXT:    ld.u32 %r2, [%rd1+4];
-; DISABLED-NEXT:    ld.u32 %r3, [%rd1+8];
-; DISABLED-NEXT:    ld.u32 %r4, [%rd1+12];
+; DISABLED-NEXT:    ld.param.b64 %rd1, [combine_v4i32_param_0];
+; DISABLED-NEXT:    ld.b32 %r1, [%rd1];
+; DISABLED-NEXT:    ld.param.b64 %rd2, [combine_v4i32_param_1];
+; DISABLED-NEXT:    ld.b32 %r2, [%rd1+4];
+; DISABLED-NEXT:    ld.b32 %r3, [%rd1+8];
+; DISABLED-NEXT:    ld.b32 %r4, [%rd1+12];
 ; DISABLED-NEXT:    add.s32 %r5, %r1, %r2;
 ; DISABLED-NEXT:    add.s32 %r6, %r5, %r3;
 ; DISABLED-NEXT:    add.s32 %r7, %r6, %r4;
-; DISABLED-NEXT:    st.u32 [%rd2], %r7;
+; DISABLED-NEXT:    st.b32 [%rd2], %r7;
 ; DISABLED-NEXT:    ret;
   %val0 = load i32, ptr %ptr1, align 16
   %ptr1.1 = getelementptr inbounds i32, ptr %ptr1, i64 1
diff --git a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll
index ee2535f16fc86..aeb4a50e96f8a 100644
--- a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll
+++ b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll
@@ -10,7 +10,7 @@ declare void @foo()
 ; the call may modify memory.
 define i32 @f(i32 %x, ptr %ptr, i1 %cond) {
 Start:
-  ; CHECK: ld.u32
+  ; CHECK: ld.b32
   %ptr_val = load i32, ptr %ptr
   ; CHECK: call.uni
   call void @foo()
diff --git a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll
index 222f147a7d46a..43085cf718bfc 100644
--- a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll
+++ b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll
@@ -10,7 +10,7 @@ declare void @llvm.nvvm.barrier0()
 ; syncthreads is modeled as maystore.
 define i32 @f(i32 %x, ptr %ptr, i1 %cond) {
 Start:
-  ; CHECK: ld.u32
+  ; CHECK: ld.b32
   %ptr_val = load i32, ptr %ptr
   ; CHECK: bar.sync
   call void @llvm.nvvm.barrier0()
diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
index 86d3f33a0421c..a816f2e84b064 100644
--- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll
@@ -23,10 +23,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   ; load cast
   %1 = load float, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
   call void @use(float %1)
-; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar];
   ; store cast
   store float %v, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
-; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+; PTX: st.shared.b32 [scalar], %f{{[0-9]+}};
   ; use syncthreads to disable optimizations across components
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
@@ -35,20 +35,20 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   %2 = addrspacecast ptr addrspace(3) @scalar to ptr
   %3 = load float, ptr %2, align 4
   call void @use(float %3)
-; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar];
   ; cast; store
   store float %v, ptr %2, align 4
-; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+; PTX: st.shared.b32 [scalar], %f{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
   ; load gep cast
   %4 = load float, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4
   call void @use(float %4)
-; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20];
   ; store gep cast
   store float %v, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4
-; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+; PTX: st.shared.b32 [array+20], %f{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
@@ -56,10 +56,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   %5 = getelementptr inbounds [10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5
   %6 = load float, ptr %5, align 4
   call void @use(float %6)
-; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20];
   ; gep cast; store
   store float %v, ptr %5, align 4
-; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+; PTX: st.shared.b32 [array+20], %f{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
@@ -68,10 +68,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   %8 = getelementptr inbounds [10 x float], ptr %7, i32 0, i32 %i
   %9 = load float, ptr %8, align 4
   call void @use(float %9)
-; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
+; PTX: ld.shared.b32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
   ; cast; gep; store
   store float %v, ptr %8, align 4
-; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
+; PTX: st.shared.b32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
   call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
@@ -84,7 +84,7 @@ define i32 @ld_int_from_float() {
 ; IR-LABEL: @ld_int_from_float
 ; IR: load i32, ptr addrspace(3) @scalar
 ; PTX-LABEL: ld_int_from_float(
-; PTX: ld.shared.u{{(32|64)}}
+; PTX: ld.shared.b{{(32|64)}}
   %1 = load i32, ptr addrspacecast(ptr addrspace(3) @scalar to ptr), align 4
   ret i32 %1
 }
@@ -108,7 +108,7 @@ define void @nested_const_expr() {
   ; store 1 to bitcast(gep(addrspacecast(array), 0, 1))
   store i32 1, ptr getelementptr ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i64 0, i64 1), align 4
 ; PTX: mov.b32 %r1, 1;
-; PTX-NEXT: st.shared.u32 [array+4], %r1;
+; PTX-NEXT: st.shared.b32 [array+4], %r1;
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/addr-mode.ll b/llvm/test/CodeGen/NVPTX/addr-mode.ll
index ab8fab6c8a3fe..7b02872bfb619 100644
--- a/llvm/test/CodeGen/NVPTX/addr-mode.ll
+++ b/llvm/test/CodeGen/NVPTX/addr-mode.ll
@@ -10,8 +10,8 @@ define i32 @test_addr_mode_i64(ptr %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i64_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_addr_mode_i64_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1+-4];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %addr = getelementptr i32, ptr %x, i64 -1
@@ -26,8 +26,8 @@ define i32 @test_addr_mode_i32(ptr %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i32_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_addr_mode_i32_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1+-4];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %addr = getelementptr i32, ptr %x, i32 -1
@@ -42,8 +42,8 @@ define i32 @test_addr_mode_i16(ptr %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i16_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_addr_mode_i16_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1+-4];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %addr = getelementptr i32, ptr %x, i16 -1
@@ -58,8 +58,8 @@ define i32 @test_addr_mode_i8(ptr %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1+-4];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_addr_mode_i8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1+-4];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %addr = getelementptr i32, ptr %x, i8 -1
@@ -74,9 +74,9 @@ define i32 @test_addr_mode_i64_large(ptr %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_addr_mode_i64_large_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_addr_mode_i64_large_param_0];
 ; CHECK-NEXT:    add.s64 %rd2, %rd1, 17179869172;
-; CHECK-NEXT:    ld.u32 %r1, [%rd2];
+; CHECK-NEXT:    ld.b32 %r1, [%rd2];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %addr = getelementptr i32, ptr %x, i64 4294967293
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll
index 87698c1c9644b..b3e5cbe09a096 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll
@@ -10,7 +10,7 @@ define ptr @test1(ptr %p) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test1_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test1_param_0];
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd1;
 ; CHECK-NEXT:    ret;
   %a = addrspacecast ptr %p to ptr addrspace(5)
@@ -24,7 +24,7 @@ define ptr addrspace(1) @test2(ptr addrspace(5) %p) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test2_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test2_param_0];
 ; CHECK-NEXT:    cvta.local.u64 %rd2, %rd1;
 ; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll
index 01326db9a8b16..00b17896d2c9e 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll
@@ -13,9 +13,9 @@ define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) {
 ; NOPTRCONV-NEXT:    .reg .b64 %rd<3>;
 ; NOPTRCONV-EMPTY:
 ; NOPTRCONV-NEXT:  // %bb.0:
-; NOPTRCONV-NEXT:    ld.param.u64 %rd1, [conv_shared_cluster_to_generic_param_0];
+; NOPTRCONV-NEXT:    ld.param.b64 %rd1, [conv_shared_cluster_to_generic_param_0];
 ; NOPTRCONV-NEXT:    cvta.shared::cluster.u64 %rd2, %rd1;
-; NOPTRCONV-NEXT:    ld.u32 %r1, [%rd2];
+; NOPTRCONV-NEXT:    ld.b32 %r1, [%rd2];
 ; NOPTRCONV-NEXT:    st.param.b32 [func_retval0], %r1;
 ; NOPTRCONV-NEXT:    ret;
 ;
@@ -25,10 +25,10 @@ define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) {
 ; PTRCONV-NEXT:    .reg .b64 %rd<3>;
 ; PTRCONV-EMPTY:
 ; PTRCONV-NEXT:  // %bb.0:
-; PTRCONV-NEXT:    ld.param.u32 %r1, [conv_shared_cluster_to_generic_param_0];
+; PTRCONV-NEXT:    ld.param.b32 %r1, [conv_shared_cluster_to_generic_param_0];
 ; PTRCONV-NEXT:    cvt.u64.u32 %rd1, %r1;
 ; PTRCONV-NEXT:    cvta.shared::cluster.u64 %rd2, %rd1;
-; PTRCONV-NEXT:    ld.u32 %r2, [%rd2];
+; PTRCONV-NEXT:    ld.b32 %r2, [%rd2];
 ; PTRCONV-NEXT:    st.param.b32 [func_retval0], %r2;
 ; PTRCONV-NEXT:    ret;
   %genptr = addrspacecast ptr addrspace(7) %ptr to ptr
@@ -45,9 +45,9 @@ define i32 @conv_generic_to_shared_cluster(ptr %ptr) {
 ; NOPTRCONV-NEXT:    .reg .b64 %rd<3>;
 ; NOPTRCONV-EMPTY:
 ; NOPTRCONV-NEXT:  // %bb.0:
-; NOPTRCONV-NEXT:    ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0];
+; NOPTRCONV-NEXT:    ld.param.b64 %rd1, [conv_generic_to_shared_cluster_param_0];
 ; NOPTRCONV-NEXT:    cvta.to.shared::cluster.u64 %rd2, %rd1;
-; NOPTRCONV-NEXT:    ld.shared::cluster.u32 %r1, [%rd2];
+; NOPTRCONV-NEXT:    ld.shared::cluster.b32 %r1, [%rd2];
 ; NOPTRCONV-NEXT:    st.param.b32 [func_retval0], %r1;
 ; NOPTRCONV-NEXT:    ret;
 ;
@@ -57,10 +57,10 @@ define i32 @conv_generic_to_shared_cluster(ptr %ptr) {
 ; PTRCONV-NEXT:    .reg .b64 %rd<3>;
 ; PTRCONV-EMPTY:
 ; PTRCONV-NEXT:  // %bb.0:
-; PTRCONV-NEXT:    ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0];
+; PTRCONV-NEXT:    ld.param.b64 %rd1, [conv_generic_to_shared_cluster_param_0];
 ; PTRCONV-NEXT:    cvta.to.shared::cluster.u64 %rd2, %rd1;
 ; PTRCONV-NEXT:    cvt.u32.u64 %r1, %rd2;
-; PTRCONV-NEXT:    ld.shared::cluster.u32 %r2, [%r1];
+; PTRCONV-NEXT:    ld.shared::cluster.b32 %r2, [%r1];
 ; PTRCONV-NEXT:    st.param.b32 [func_retval0], %r2;
 ; PTRCONV-NEXT:    ret;
   %specptr = addrspacecast ptr %ptr to ptr addrspace(7)
@@ -76,10 +76,10 @@ define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) {
 ; NOPTRCONV-NEXT:    .reg .b64 %rd<4>;
 ; NOPTRCONV-EMPTY:
 ; NOPTRCONV-NEXT:  // %bb.0:
-; NOPTRCONV-NEXT:    ld.param.u64 %rd1, [conv_shared_to_shared_cluster_param_0];
+; NOPTRCONV-NEXT:    ld.param.b64 %rd1, [conv_shared_to_shared_cluster_param_0];
 ; NOPTRCONV-NEXT:    cvta.shared.u64 %rd2, %rd1;
 ; NOPTRCONV-NEXT:    cvta.to.shared::cluster.u64 %rd3, %rd2;
-; NOPTRCONV-NEXT:    ld.shared::cluster.u32 %r1, [%rd3];
+; NOPTRCONV-NEXT:    ld.shared::cluster.b32 %r1, [%rd3];
 ; NOPTRCONV-NEXT:    st.param.b32 [func_retval0], %r1;
 ; NOPTRCONV-NEXT:    ret;
 ;
@@ -89,12 +89,12 @@ define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) {
 ; PTRCONV-NEXT:    .reg .b64 %rd<4>;
 ; PTRCONV-EMPTY:
 ; PTRCONV-NEXT:  // %bb.0:
-; PTRCONV-NEXT:    ld.param.u32 %r1, [conv_shared_to_shared_cluster_param_0];
+; PTRCONV-NEXT:    ld.param.b32 %r1, [conv_shared_to_shared_cluster_param_0];
 ; PTRCONV-NEXT:    cvt.u64.u32 %rd1, %r1;
 ; PTRCONV-NEXT:    cvta.shared.u64 %rd2, %rd1;
 ; PTRCONV-NEXT:    cvta.to.shared::cluster.u64 %rd3, %rd2;
 ; PTRCONV-NEXT:    cvt.u32.u64 %r2, %rd3;
-; PTRCONV-NEXT:    ld.shared::cluster.u32 %r3, [%r2];
+; PTRCONV-NEXT:    ld.shared::cluster.b32 %r3, [%r2];
 ; PTRCONV-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTRCONV-NEXT:    ret;
   %specptr = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(7)
@@ -110,10 +110,10 @@ define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) {
 ; NOPTRCONV-NEXT:    .reg .b64 %rd<4>;
 ; NOPTRCONV-EMPTY:
 ; NOPTRCONV-NEXT:  // %bb.0:
-; NOPTRCONV-NEXT:    ld.param.u64 %rd1, [conv_shared_cluster_to_shared_param_0];
+; NOPTRCONV-NEXT:    ld.param.b64 %rd1, [conv_shared_cluster_to_shared_param_0];
 ; NOPTRCONV-NEXT:    cvta.shared::cluster.u64 %rd2, %rd1;
 ; NOPTRCONV-NEXT:    cvta.to.shared.u64 %rd3, %rd2;
-; NOPTRCONV-NEXT:    ld.shared.u32 %r1, [%rd3];
+; NOPTRCONV-NEXT:    ld.shared.b32 %r1, [%rd3];
 ; NOPTRCONV-NEXT:    st.param.b32 [func_retval0], %r1;
 ; NOPTRCONV-NEXT:    ret;
 ;
@@ -123,12 +123,12 @@ define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) {
 ; PTRCONV-NEXT:    .reg .b64 %rd<4>;
 ; PTRCONV-EMPTY:
 ; PTRCONV-NEXT:  // %bb.0:
-; PTRCONV-NEXT:    ld.param.u32 %r1, [conv_shared_cluster_to_shared_param_0];
+; PTRCONV-NEXT:    ld.param.b32 %r1, [conv_shared_cluster_to_shared_param_0];
 ; PTRCONV-NEXT:    cvt.u64.u32 %rd1, %r1;
 ; PTRCONV-NEXT:    cvta.shared::cluster.u64 %rd2, %rd1;
 ; PTRCONV-NEXT:    cvta.to.shared.u64 %rd3, %rd2;
 ; PTRCONV-NEXT:    cvt.u32.u64 %r2, %rd3;
-; PTRCONV-NEXT:    ld.shared.u32 %r3, [%r2];
+; PTRCONV-NEXT:    ld.shared.b32 %r3, [%r2];
 ; PTRCONV-NEXT:    st.param.b32 [func_retval0], %r3;
 ; PTRCONV-NEXT:    ret;
   %specptr = addrspacecast ptr addrspace(7) %ptr to ptr addrspace(3)
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
index 0aa66d1fc45f3..86008a1b70058 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
@@ -10,7 +10,7 @@ define i32 @conv1(ptr addrspace(1) %ptr) {
 ; CLS32: cvta.global.u32
 ; ALL-NOT: cvt.u64.u32
 ; CLS64: cvta.global.u64
-; ALL: ld.u32
+; ALL: ld.b32
   %genptr = addrspacecast ptr addrspace(1) %ptr to ptr
   %val = load i32, ptr %genptr
   ret i32 %val
@@ -22,7 +22,7 @@ define i32 @conv2(ptr addrspace(3) %ptr) {
 ; PTRCONV: cvt.u64.u32
 ; NOPTRCONV-NOT: cvt.u64.u32
 ; CLS64: cvta.shared.u64
-; ALL: ld.u32
+; ALL: ld.b32
   %genptr = addrspacecast ptr addrspace(3) %ptr to ptr
   %val = load i32, ptr %genptr
   ret i32 %val
@@ -34,7 +34,7 @@ define i32 @conv3(ptr addrspace(4) %ptr) {
 ; PTRCONV: cvt.u64.u32
 ; NOPTRCONV-NOT: cvt.u64.u32
 ; CLS64: cvta.const.u64
-; ALL: ld.u32
+; ALL: ld.b32
   %genptr = addrspacecast ptr addrspace(4) %ptr to ptr
   %val = load i32, ptr %genptr
   ret i32 %val
@@ -46,7 +46,7 @@ define i32 @conv4(ptr addrspace(5) %ptr) {
 ; PTRCONV: cvt.u64.u32
 ; NOPTRCONV-NOT: cvt.u64.u32
 ; CLS64: cvta.local.u64
-; ALL: ld.u32
+; ALL: ld.b32
   %genptr = addrspacecast ptr addrspace(5) %ptr to ptr
   %val = load i32, ptr %genptr
   ret i32 %val
@@ -57,7 +57,7 @@ define i32 @conv5(ptr %ptr) {
 ; CLS32: cvta.to.global.u32
 ; ALL-NOT: cvt.u64.u32
 ; CLS64: cvta.to.global.u64
-; ALL: ld.global.u32
+; ALL: ld.global.b32
   %specptr = addrspacecast ptr %ptr to ptr addrspace(1)
   %val = load i32, ptr addrspace(1) %specptr
   ret i32 %val
@@ -69,7 +69,7 @@ define i32 @conv6(ptr %ptr) {
 ; CLS64: cvta.to.shared.u64
 ; PTRCONV: cvt.u32.u64
 ; NOPTRCONV-NOT: cvt.u32.u64
-; ALL: ld.shared.u32
+; ALL: ld.shared.b32
   %specptr = addrspacecast ptr %ptr to ptr addrspace(3)
   %val = load i32, ptr addrspace(3) %specptr
   ret i32 %val
@@ -81,7 +81,7 @@ define i32 @conv7(ptr %ptr) {
 ; CLS64: cvta.to.const.u64
 ; PTRCONV: cvt.u32.u64
 ; NOPTRCONV-NOT: cvt.u32.u64
-; ALL: ld.const.u32
+; ALL: ld.const.b32
   %specptr = addrspacecast ptr %ptr to ptr addrspace(4)
   %val = load i32, ptr addrspace(4) %specptr
   ret i32 %val
@@ -93,7 +93,7 @@ define i32 @conv8(ptr %ptr) {
 ; CLS64: cvta.to.local.u64
 ; PTRCONV: cvt.u32.u64
 ; NOPTRCONV-NOT: cvt.u32.u64
-; ALL: ld.local.u32
+; ALL: ld.local.b32
   %specptr = addrspacecast ptr %ptr to ptr addrspace(5)
   %val = load i32, ptr addrspace(5) %specptr
   ret i32 %val
@@ -104,7 +104,7 @@ define i32 @conv9(ptr addrspace(1) %ptr) {
 ; CLS32:     // implicit-def: %[[ADDR:r[0-9]+]]
 ; PTRCONV:   // implicit-def: %[[ADDR:r[0-9]+]]
 ; NOPTRCONV: // implicit-def: %[[ADDR:rd[0-9]+]]
-; ALL: ld.shared.u32 %r{{[0-9]+}}, [%[[ADDR]]]
+; ALL: ld.shared.b32 %r{{[0-9]+}}, [%[[ADDR]]]
   %specptr = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(3)
   %val = load i32, ptr addrspace(3) %specptr
   ret i32 %val
@@ -120,8 +120,8 @@ define void @split1To0(ptr nocapture noundef readonly %xs) {
 ; CLS32: cvta.global.u32
 ; CLS64: cvta.global.u64
 ; CLS64: cvta.global.u64
-; ALL: st.u32
-; ALL: st.u32
+; ALL: st.b32
+; ALL: st.b32
   %vec_addr = load <2 x ptr addrspace(1)>, ptr %xs, align 16
   %addrspacecast = addrspacecast <2 x ptr addrspace(1)> %vec_addr to <2 x ptr>
   %extractelement0 = extractelement <2 x ptr> %addrspacecast, i64 0
@@ -139,8 +139,8 @@ define void @split0To1(ptr nocapture noundef readonly %xs) {
 ; CLS32: cvta.to.global.u32
 ; CLS64: cvta.to.global.u64
 ; CLS64: cvta.to.global.u64
-; ALL: st.global.u32
-; ALL: st.global.u32
+; ALL: st.global.b32
+; ALL: st.global.b32
   %vec_addr = load <2 x ptr>, ptr %xs, align 16
   %addrspacecast = addrspacecast <2 x ptr> %vec_addr to <2 x ptr addrspace(1)>
   %extractelement0 = extractelement <2 x ptr addrspace(1)> %addrspacecast, i64 0
@@ -162,9 +162,9 @@ define void @widen1To0(ptr nocapture noundef readonly %xs) {
 ; CLS64: cvta.global.u64
 ; CLS64: cvta.global.u64
 
-; ALL: st.u32
-; ALL: st.u32
-; ALL: st.u32
+; ALL: st.b32
+; ALL: st.b32
+; ALL: st.b32
   %vec_addr = load <3 x ptr addrspace(1)>, ptr %xs, align 16
   %addrspacecast = addrspacecast <3 x ptr addrspace(1)> %vec_addr to <3 x ptr>
   %extractelement0 = extractelement <3 x ptr> %addrspacecast, i64 0
@@ -188,9 +188,9 @@ define void @widen0To1(ptr nocapture noundef readonly %xs) {
 ; CLS64: cvta.to.global.u64
 ; CLS64: cvta.to.global.u64
 
-; ALL: st.global.u32
-; ALL: st.global.u32
-; ALL: st.global.u32
+; ALL: st.global.b32
+; ALL: st.global.b32
+; ALL: st.global.b32
   %vec_addr = load <3 x ptr>, ptr %xs, align 16
   %addrspacecast = addrspacecast <3 x ptr> %vec_addr to <3 x ptr addrspace(1)>
   %extractelement0 = extractelement <3 x ptr addrspace(1)> %addrspacecast, i64 0
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index cda7d38ccb0b7..72c302433f081 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -10,9 +10,9 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 ; CHECK-LABEL: @test_v2f32
   %call = tail call <2 x float> @barv(<2 x float> %input)
 ; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
+; CHECK: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
   store <2 x float> %call, ptr %output, align 8
-; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
+; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
   ret void
 }
 
@@ -21,15 +21,15 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
 ;
   %call = tail call <3 x float> @barv3(<3 x float> %input)
 ; CHECK: .param .align 16 .b8 retval0[16];
-; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
-; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
+; CHECK-DAG: ld.param.b32 [[E2:%f[0-9]+]], [retval0+8];
 ; Make sure we don't load more values than than we need to.
-; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
+; CHECK-NOT: ld.param.b32 [[E3:%f[0-9]+]], [retval0+12];
   store <3 x float> %call, ptr %output, align 8
-; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
+; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8],
 ; -- This is suboptimal. We should do st.v2.f32 instead
 ;    of combining 2xf32 info i64.
-; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
+; CHECK-DAG: st.b64 [{{%rd[0-9]}}],
 ; CHECK: ret;
   ret void
 }
@@ -38,12 +38,12 @@ define void @test_a2f32([2 x float] %input, ptr %output) {
 ; CHECK-LABEL: @test_a2f32
   %call = tail call [2 x float] @bara([2 x float] %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
-; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0];
-; CHECK-DAG: ld.param.f32 [[ELEMA2:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[ELEMA1:%f[0-9]+]], [retval0];
+; CHECK-DAG: ld.param.b32 [[ELEMA2:%f[0-9]+]], [retval0+4];
   store [2 x float] %call, ptr %output, align 4
 ; CHECK: }
-; CHECK-DAG: st.f32 [{{%rd[0-9]+}}], [[ELEMA1]]
-; CHECK-DAG: st.f32 [{{%rd[0-9]+}}+4], [[ELEMA2]]
+; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMA1]]
+; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMA2]]
   ret void
 ; CHECK: ret
 }
@@ -52,12 +52,12 @@ define void @test_s2f32({float, float} %input, ptr %output) {
 ; CHECK-LABEL: @test_s2f32
   %call = tail call {float, float} @bars({float, float} %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
-; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0];
-; CHECK-DAG: ld.param.f32 [[ELEMS2:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[ELEMS1:%f[0-9]+]], [retval0];
+; CHECK-DAG: ld.param.b32 [[ELEMS2:%f[0-9]+]], [retval0+4];
   store {float, float} %call, ptr %output, align 4
 ; CHECK: }
-; CHECK-DAG: st.f32 [{{%rd[0-9]+}}], [[ELEMS1]]
-; CHECK-DAG: st.f32 [{{%rd[0-9]+}}+4], [[ELEMS2]]
+; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMS1]]
+; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMS2]]
   ret void
 ; CHECK: ret
 }
diff --git a/llvm/test/CodeGen/NVPTX/and-or-setcc.ll b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll
index 5949de335b8cf..53c741bd6cb2c 100644
--- a/llvm/test/CodeGen/NVPTX/and-or-setcc.ll
+++ b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll
@@ -12,8 +12,8 @@ define i1 @and_ord(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [and_ord_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [and_ord_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [and_ord_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [and_ord_param_1];
 ; CHECK-NEXT:    setp.num.f32 %p1, %f1, %f2;
 ; CHECK-NEXT:    selp.b32 %r1, 1, 0, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -32,8 +32,8 @@ define i1 @or_uno(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [or_uno_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [or_uno_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [or_uno_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [or_uno_param_1];
 ; CHECK-NEXT:    setp.nan.f32 %p1, %f1, %f2;
 ; CHECK-NEXT:    selp.b32 %r1, 1, 0, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
diff --git a/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll b/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll
index 9d7bd4a3ed6c2..713ceb2d7d57c 100644
--- a/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll
+++ b/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll
@@ -6,7 +6,7 @@
 
 ; CHECK:      .func (.param .b32 func_retval0) __unnamed_1(
 ; CHECK-NEXT: .param .b32 __unnamed_1_param_0
-; CHECK:      ld.param.u32 {{%r[0-9]+}}, [__unnamed_1_param_0];
+; CHECK:      ld.param.b32 {{%r[0-9]+}}, [__unnamed_1_param_0];
 
 define internal i32 @0(i32 %a) {
 entry:
@@ -16,7 +16,7 @@ entry:
 
 ; CHECK:      .func (.param .b32 func_retval0) __unnamed_2(
 ; CHECK-NEXT: .param .b32 __unnamed_2_param_0
-; CHECK:      ld.param.u32 {{%r[0-9]+}}, [__unnamed_2_param_0];
+; CHECK:      ld.param.b32 {{%r[0-9]+}}, [__unnamed_2_param_0];
 
 define internal i32 @1(i32 %a) {
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/applypriority.ll b/llvm/test/CodeGen/NVPTX/applypriority.ll
index af161d82a25ea..23b1bda9a32bf 100644
--- a/llvm/test/CodeGen/NVPTX/applypriority.ll
+++ b/llvm/test/CodeGen/NVPTX/applypriority.ll
@@ -13,7 +13,7 @@ define void @applypriority_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [applypriority_global_L2_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [applypriority_global_L2_param_0];
 ; CHECK-PTX64-NEXT:    applypriority.global.L2::evict_normal [%rd1], 128;
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 128)
@@ -26,7 +26,7 @@ define void @applypriority_L2(ptr %ptr, i64 %size) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [applypriority_L2_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [applypriority_L2_param_0];
 ; CHECK-PTX64-NEXT:    applypriority.L2::evict_normal [%rd1], 128;
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 128)
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index b14295020bc0e..22a7177650ee2 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -15,13 +15,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_param_0];
 ; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
 ; CHECK-NEXT:    atom.add.noftz.f16 %rs2, [%r1], %rs1;
-; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_param_1];
 ; CHECK-NEXT:    mov.b16 %rs3, 0x3C00;
 ; CHECK-NEXT:    atom.add.noftz.f16 %rs4, [%r1], %rs3;
-; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_param_2];
 ; CHECK-NEXT:    atom.global.add.noftz.f16 %rs5, [%r2], %rs1;
 ; CHECK-NEXT:    atom.shared.add.noftz.f16 %rs6, [%r3], %rs1;
 ; CHECK-NEXT:    ret;
@@ -32,13 +32,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECK64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK64-EMPTY:
 ; CHECK64-NEXT:  // %bb.0:
-; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; CHECK64-NEXT:    ld.param.b64 %rd1, [test_param_0];
 ; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
 ; CHECK64-NEXT:    atom.add.noftz.f16 %rs2, [%rd1], %rs1;
-; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; CHECK64-NEXT:    ld.param.b64 %rd2, [test_param_1];
 ; CHECK64-NEXT:    mov.b16 %rs3, 0x3C00;
 ; CHECK64-NEXT:    atom.add.noftz.f16 %rs4, [%rd1], %rs3;
-; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; CHECK64-NEXT:    ld.param.b64 %rd3, [test_param_2];
 ; CHECK64-NEXT:    atom.global.add.noftz.f16 %rs5, [%rd2], %rs1;
 ; CHECK64-NEXT:    atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1;
 ; CHECK64-NEXT:    ret;
@@ -51,16 +51,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-EMPTY:
 ; CHECKPTX62-NEXT:  // %bb.0:
 ; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX62-NEXT:    ld.param.u32 %r23, [test_param_2];
-; CHECKPTX62-NEXT:    ld.param.u32 %r22, [test_param_1];
-; CHECKPTX62-NEXT:    ld.param.u32 %r24, [test_param_0];
+; CHECKPTX62-NEXT:    ld.param.b32 %r23, [test_param_2];
+; CHECKPTX62-NEXT:    ld.param.b32 %r22, [test_param_1];
+; CHECKPTX62-NEXT:    ld.param.b32 %r24, [test_param_0];
 ; CHECKPTX62-NEXT:    and.b32 %r1, %r24, -4;
 ; CHECKPTX62-NEXT:    and.b32 %r25, %r24, 3;
 ; CHECKPTX62-NEXT:    shl.b32 %r2, %r25, 3;
 ; CHECKPTX62-NEXT:    mov.b32 %r26, 65535;
 ; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
 ; CHECKPTX62-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX62-NEXT:    ld.u32 %r54, [%r1];
+; CHECKPTX62-NEXT:    ld.b32 %r54, [%r1];
 ; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT:    shr.u32 %r28, %r54, %r2;
@@ -75,7 +75,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT:    mov.b32 %r54, %r6;
 ; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
 ; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end44
-; CHECKPTX62-NEXT:    ld.u32 %r55, [%r1];
+; CHECKPTX62-NEXT:    ld.b32 %r55, [%r1];
 ; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT:    shr.u32 %r33, %r55, %r2;
@@ -97,7 +97,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT:    mov.b32 %r39, 65535;
 ; CHECKPTX62-NEXT:    shl.b32 %r40, %r39, %r11;
 ; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX62-NEXT:    ld.global.u32 %r56, [%r10];
+; CHECKPTX62-NEXT:    ld.global.b32 %r56, [%r10];
 ; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT:    shr.u32 %r41, %r56, %r11;
@@ -118,7 +118,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT:    mov.b32 %r47, 65535;
 ; CHECKPTX62-NEXT:    shl.b32 %r48, %r47, %r17;
 ; CHECKPTX62-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX62-NEXT:    ld.shared.u32 %r57, [%r16];
+; CHECKPTX62-NEXT:    ld.shared.b32 %r57, [%r16];
 ; CHECKPTX62-NEXT:  $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX62-NEXT:    shr.u32 %r49, %r57, %r17;
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index f27e574724ce4..b5a4f94611453 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -15,13 +15,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_param_0];
 ; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
 ; CHECK-NEXT:    atom.add.noftz.bf16 %rs2, [%r1], %rs1;
-; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_param_1];
 ; CHECK-NEXT:    mov.b16 %rs3, 0x3F80;
 ; CHECK-NEXT:    atom.add.noftz.bf16 %rs4, [%r1], %rs3;
-; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_param_2];
 ; CHECK-NEXT:    atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
 ; CHECK-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
 ; CHECK-NEXT:    ret;
@@ -32,13 +32,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECK64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK64-EMPTY:
 ; CHECK64-NEXT:  // %bb.0:
-; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; CHECK64-NEXT:    ld.param.b64 %rd1, [test_param_0];
 ; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
 ; CHECK64-NEXT:    atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
-; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; CHECK64-NEXT:    ld.param.b64 %rd2, [test_param_1];
 ; CHECK64-NEXT:    mov.b16 %rs3, 0x3F80;
 ; CHECK64-NEXT:    atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
-; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; CHECK64-NEXT:    ld.param.b64 %rd3, [test_param_2];
 ; CHECK64-NEXT:    atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
 ; CHECK64-NEXT:    atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
 ; CHECK64-NEXT:    ret;
@@ -51,16 +51,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-EMPTY:
 ; CHECKPTX71-NEXT:  // %bb.0:
 ; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX71-NEXT:    ld.param.u32 %r23, [test_param_2];
-; CHECKPTX71-NEXT:    ld.param.u32 %r22, [test_param_1];
-; CHECKPTX71-NEXT:    ld.param.u32 %r24, [test_param_0];
+; CHECKPTX71-NEXT:    ld.param.b32 %r23, [test_param_2];
+; CHECKPTX71-NEXT:    ld.param.b32 %r22, [test_param_1];
+; CHECKPTX71-NEXT:    ld.param.b32 %r24, [test_param_0];
 ; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
 ; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
 ; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
 ; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
 ; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
 ; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
-; CHECKPTX71-NEXT:    ld.u32 %r54, [%r1];
+; CHECKPTX71-NEXT:    ld.b32 %r54, [%r1];
 ; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
@@ -76,7 +76,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    mov.b32 %r54, %r6;
 ; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
 ; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end44
-; CHECKPTX71-NEXT:    ld.u32 %r55, [%r1];
+; CHECKPTX71-NEXT:    ld.b32 %r55, [%r1];
 ; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
@@ -98,7 +98,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    mov.b32 %r39, 65535;
 ; CHECKPTX71-NEXT:    shl.b32 %r40, %r39, %r11;
 ; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
-; CHECKPTX71-NEXT:    ld.global.u32 %r56, [%r10];
+; CHECKPTX71-NEXT:    ld.global.b32 %r56, [%r10];
 ; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
@@ -120,7 +120,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    mov.b32 %r47, 65535;
 ; CHECKPTX71-NEXT:    shl.b32 %r48, %r47, %r17;
 ; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
-; CHECKPTX71-NEXT:    ld.shared.u32 %r57, [%r16];
+; CHECKPTX71-NEXT:    ld.shared.b32 %r57, [%r16];
 ; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index 16de80d55a054..6c5af3da5d9b2 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -11,8 +11,8 @@ define i32 @atom0(ptr %addr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom0_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom0_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom0_param_1];
 ; CHECK-NEXT:    atom.add.u32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -27,8 +27,8 @@ define i64 @atom1(ptr %addr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom1_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom1_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom1_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom1_param_1];
 ; CHECK-NEXT:    atom.add.u64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -44,8 +44,8 @@ define i32 @atom2(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom2_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom2_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom2_param_1];
 ; CHECK-NEXT:    neg.s32 %r2, %r1;
 ; CHECK-NEXT:    atom.add.u32 %r3, [%rd1], %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -61,8 +61,8 @@ define i64 @atom3(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom3_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom3_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom3_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom3_param_1];
 ; CHECK-NEXT:    neg.s64 %rd3, %rd2;
 ; CHECK-NEXT:    atom.add.u64 %rd4, [%rd1], %rd3;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
@@ -79,8 +79,8 @@ define i32 @atom4(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom4_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom4_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom4_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom4_param_1];
 ; CHECK-NEXT:    atom.and.b32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -95,8 +95,8 @@ define i64 @atom5(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom5_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom5_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom5_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom5_param_1];
 ; CHECK-NEXT:    atom.and.b64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -123,8 +123,8 @@ define i32 @atom8(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom8_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom8_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom8_param_1];
 ; CHECK-NEXT:    atom.or.b32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -139,8 +139,8 @@ define i64 @atom9(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom9_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom9_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom9_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom9_param_1];
 ; CHECK-NEXT:    atom.or.b64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -156,8 +156,8 @@ define i32 @atom10(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom10_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom10_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom10_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom10_param_1];
 ; CHECK-NEXT:    atom.xor.b32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -172,8 +172,8 @@ define i64 @atom11(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom11_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom11_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom11_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom11_param_1];
 ; CHECK-NEXT:    atom.xor.b64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -189,8 +189,8 @@ define i32 @atom12(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom12_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom12_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom12_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom12_param_1];
 ; CHECK-NEXT:    atom.max.s32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -205,8 +205,8 @@ define i64 @atom13(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom13_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom13_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom13_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom13_param_1];
 ; CHECK-NEXT:    atom.max.s64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -222,8 +222,8 @@ define i32 @atom14(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom14_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom14_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom14_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom14_param_1];
 ; CHECK-NEXT:    atom.min.s32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -238,8 +238,8 @@ define i64 @atom15(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom15_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom15_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom15_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom15_param_1];
 ; CHECK-NEXT:    atom.min.s64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -255,8 +255,8 @@ define i32 @atom16(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom16_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom16_param_1];
 ; CHECK-NEXT:    atom.max.u32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -271,8 +271,8 @@ define i64 @atom17(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom17_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom17_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom17_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom17_param_1];
 ; CHECK-NEXT:    atom.max.u64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -288,8 +288,8 @@ define i32 @atom18(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom18_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom18_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom18_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom18_param_1];
 ; CHECK-NEXT:    atom.min.u32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -304,8 +304,8 @@ define i64 @atom19(ptr %subr, i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom19_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atom19_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom19_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atom19_param_1];
 ; CHECK-NEXT:    atom.min.u64 %rd3, [%rd1], %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -320,8 +320,8 @@ define i32 @atom20(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom20_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom20_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom20_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom20_param_1];
 ; CHECK-NEXT:    atom.inc.u32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -336,8 +336,8 @@ define i32 @atom21(ptr %subr, i32 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atom21_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [atom21_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atom21_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [atom21_param_1];
 ; CHECK-NEXT:    atom.dec.u32 %r2, [%rd1], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -355,10 +355,10 @@ define float @atomic_add_f32_generic(ptr %addr, float %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomic_add_f32_generic_param_0];
-; CHECK-NEXT:    ld.param.f32 %f1, [atomic_add_f32_generic_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_add_f32_generic_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [atomic_add_f32_generic_param_1];
 ; CHECK-NEXT:    atom.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
   ret float %ret
@@ -374,10 +374,10 @@ define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomic_add_f32_addrspace1_param_0];
-; CHECK-NEXT:    ld.param.f32 %f1, [atomic_add_f32_addrspace1_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_add_f32_addrspace1_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [atomic_add_f32_addrspace1_param_1];
 ; CHECK-NEXT:    atom.global.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %val)
   ret float %ret
@@ -393,10 +393,10 @@ define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomic_add_f32_addrspace3_param_0];
-; CHECK-NEXT:    ld.param.f32 %f1, [atomic_add_f32_addrspace3_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_add_f32_addrspace3_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [atomic_add_f32_addrspace3_param_1];
 ; CHECK-NEXT:    atom.shared.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val)
   ret float %ret
@@ -410,10 +410,10 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomicrmw_add_f32_generic_param_0];
-; CHECK-NEXT:    ld.param.f32 %f1, [atomicrmw_add_f32_generic_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomicrmw_add_f32_generic_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [atomicrmw_add_f32_generic_param_1];
 ; CHECK-NEXT:    atom.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr %addr, float %val seq_cst
   ret float %ret
@@ -431,7 +431,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd2, [atomicrmw_add_f16_generic_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atomicrmw_add_f16_generic_param_0];
 ; CHECK-NEXT:    and.b64 %rd1, %rd2, -4;
 ; CHECK-NEXT:    cvt.u32.u64 %r6, %rd2;
 ; CHECK-NEXT:    and.b32 %r7, %r6, 3;
@@ -439,7 +439,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
 ; CHECK-NEXT:    mov.b32 %r8, 65535;
 ; CHECK-NEXT:    shl.b32 %r9, %r8, %r1;
 ; CHECK-NEXT:    not.b32 %r2, %r9;
-; CHECK-NEXT:    ld.u32 %r16, [%rd1];
+; CHECK-NEXT:    ld.b32 %r16, [%rd1];
 ; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
 ; CHECK-NEXT:  $L__BB24_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -474,10 +474,10 @@ define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace1_param_0];
-; CHECK-NEXT:    ld.param.f32 %f1, [atomicrmw_add_f32_addrspace1_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace1_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [atomicrmw_add_f32_addrspace1_param_1];
 ; CHECK-NEXT:    atom.global.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr addrspace(1) %addr, float %val seq_cst
   ret float %ret
@@ -491,10 +491,10 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace3_param_0];
-; CHECK-NEXT:    ld.param.f32 %f1, [atomicrmw_add_f32_addrspace3_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace3_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [atomicrmw_add_f32_addrspace3_param_1];
 ; CHECK-NEXT:    atom.shared.add.f32 %f2, [%rd1], %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst
   ret float %ret
@@ -508,10 +508,10 @@ define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomic_cmpxchg_i32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_cmpxchg_i32_param_0];
 ; CHECK-NEXT:    membar.sys;
-; CHECK-NEXT:    ld.param.u32 %r1, [atomic_cmpxchg_i32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r2, [atomic_cmpxchg_i32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [atomic_cmpxchg_i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [atomic_cmpxchg_i32_param_2];
 ; CHECK-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -526,10 +526,10 @@ define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [atomic_cmpxchg_i64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [atomic_cmpxchg_i64_param_0];
 ; CHECK-NEXT:    membar.sys;
-; CHECK-NEXT:    ld.param.u64 %rd2, [atomic_cmpxchg_i64_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd3, [atomic_cmpxchg_i64_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd2, [atomic_cmpxchg_i64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [atomic_cmpxchg_i64_param_2];
 ; CHECK-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll
index a8f1018c50162..05bdc9087f572 100644
--- a/llvm/test/CodeGen/NVPTX/barrier.ll
+++ b/llvm/test/CodeGen/NVPTX/barrier.ll
@@ -7,8 +7,8 @@ declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)
 
 ; CHECK-LABEL: .func{{.*}}barrier_sync
 define void @barrier_sync(i32 %id, i32 %cnt) {
-  ; CHECK: ld.param.u32 	[[ID:%r[0-9]+]], [barrier_sync_param_0];
-  ; CHECK: ld.param.u32 	[[CNT:%r[0-9]+]], [barrier_sync_param_1];
+  ; CHECK: ld.param.b32 	[[ID:%r[0-9]+]], [barrier_sync_param_0];
+  ; CHECK: ld.param.b32 	[[CNT:%r[0-9]+]], [barrier_sync_param_1];
 
   ; CHECK:  barrier.sync [[ID]], [[CNT]];
   call void @llvm.nvvm.barrier.sync.cnt(i32 %id, i32 %cnt)
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 6be13c3a6fdec..1ed191fcb9ff5 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -22,10 +22,10 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
 ; SM70-NEXT:    .reg .b32 %f<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_fadd_param_1];
+; SM70-NEXT:    ld.param.b16 %r1, [test_fadd_param_1];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    ld.param.u16 %r3, [test_fadd_param_0];
+; SM70-NEXT:    ld.param.b16 %r3, [test_fadd_param_0];
 ; SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; SM70-NEXT:    mov.b32 %f2, %r4;
 ; SM70-NEXT:    add.rn.f32 %f3, %f2, %f1;
@@ -90,10 +90,10 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
 ; SM70-NEXT:    .reg .b32 %f<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_fsub_param_1];
+; SM70-NEXT:    ld.param.b16 %r1, [test_fsub_param_1];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    ld.param.u16 %r3, [test_fsub_param_0];
+; SM70-NEXT:    ld.param.b16 %r3, [test_fsub_param_0];
 ; SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; SM70-NEXT:    mov.b32 %f2, %r4;
 ; SM70-NEXT:    sub.rn.f32 %f3, %f2, %f1;
@@ -569,10 +569,10 @@ define float @test_fpext_float(bfloat %a) #0 {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_fpext_float_param_0];
+; SM70-NEXT:    ld.param.b16 %r1, [test_fpext_float_param_0];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    st.param.f32 [func_retval0], %f1;
+; SM70-NEXT:    st.param.b32 [func_retval0], %f1;
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_fpext_float(
@@ -583,7 +583,7 @@ define float @test_fpext_float(bfloat %a) #0 {
 ; SM80-NEXT:  // %bb.0:
 ; SM80-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
 ; SM80-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM80-NEXT:    st.param.f32 [func_retval0], %f1;
+; SM80-NEXT:    st.param.b32 [func_retval0], %f1;
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_fpext_float(
@@ -594,7 +594,7 @@ define float @test_fpext_float(bfloat %a) #0 {
 ; SM80-FTZ-NEXT:  // %bb.0:
 ; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
 ; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f1, %rs1;
-; SM80-FTZ-NEXT:    st.param.f32 [func_retval0], %f1;
+; SM80-FTZ-NEXT:    st.param.b32 [func_retval0], %f1;
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_fpext_float(
@@ -605,7 +605,7 @@ define float @test_fpext_float(bfloat %a) #0 {
 ; SM90-NEXT:  // %bb.0:
 ; SM90-NEXT:    ld.param.b16 %rs1, [test_fpext_float_param_0];
 ; SM90-NEXT:    cvt.f32.bf16 %f1, %rs1;
-; SM90-NEXT:    st.param.f32 [func_retval0], %f1;
+; SM90-NEXT:    st.param.b32 [func_retval0], %f1;
 ; SM90-NEXT:    ret;
   %r = fpext bfloat %a to float
   ret float %r
@@ -620,7 +620,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
+; SM70-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
 ; SM70-NEXT:    mov.b32 %r1, %f1;
 ; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
 ; SM70-NEXT:    add.s32 %r3, %r2, %r1;
@@ -638,7 +638,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
 ; SM80-NEXT:    .reg .b32 %f<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
+; SM80-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
 ; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-NEXT:    ret;
@@ -649,7 +649,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
 ; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
+; SM80-FTZ-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
 ; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM80-FTZ-NEXT:    ret;
@@ -660,7 +660,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
 ; SM90-NEXT:    .reg .b32 %f<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.f32 %f1, [test_fptrunc_float_param_0];
+; SM90-NEXT:    ld.param.b32 %f1, [test_fptrunc_float_param_0];
 ; SM90-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; SM90-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM90-NEXT:    ret;
@@ -677,7 +677,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
 ; SM70-NEXT:    .reg .b32 %f<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_fadd_imm_1_param_0];
+; SM70-NEXT:    ld.param.b16 %r1, [test_fadd_imm_1_param_0];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
@@ -738,8 +738,8 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [test_select_cc_bf16_f64_param_0];
-; CHECK-NEXT:    ld.param.f64 %fd2, [test_select_cc_bf16_f64_param_1];
+; CHECK-NEXT:    ld.param.b64 %fd1, [test_select_cc_bf16_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd2, [test_select_cc_bf16_f64_param_1];
 ; CHECK-NEXT:    setp.lt.f64 %p1, %fd1, %fd2;
 ; CHECK-NEXT:    ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2];
 ; CHECK-NEXT:    ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3];
@@ -760,7 +760,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM70-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; SM70-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; SM70-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
@@ -790,8 +790,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM70-NEXT:    cvt.u32.u16 %r19, %rs1;
 ; SM70-NEXT:    shl.b32 %r20, %r19, 16;
 ; SM70-NEXT:    mov.b32 %f8, %r20;
-; SM70-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM70-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM70-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
+; SM70-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
 ; SM70-NEXT:    ret;
 ;
 ; SM80-LABEL: test_extload_bf16x8(
@@ -802,7 +802,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM80-NEXT:    .reg .b64 %rd<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
+; SM80-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM80-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
@@ -816,8 +816,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM80-NEXT:    cvt.f32.bf16 %f6, %rs3;
 ; SM80-NEXT:    cvt.f32.bf16 %f7, %rs2;
 ; SM80-NEXT:    cvt.f32.bf16 %f8, %rs1;
-; SM80-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM80-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM80-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
+; SM80-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
 ; SM80-NEXT:    ret;
 ;
 ; SM80-FTZ-LABEL: test_extload_bf16x8(
@@ -828,7 +828,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM80-FTZ-NEXT:    .reg .b64 %rd<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
+; SM80-FTZ-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM80-FTZ-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; SM80-FTZ-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; SM80-FTZ-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
@@ -842,8 +842,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f6, %rs3;
 ; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f7, %rs2;
 ; SM80-FTZ-NEXT:    cvt.ftz.f32.bf16 %f8, %rs1;
-; SM80-FTZ-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM80-FTZ-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
+; SM80-FTZ-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
 ; SM80-FTZ-NEXT:    ret;
 ;
 ; SM90-LABEL: test_extload_bf16x8(
@@ -854,7 +854,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
 ; SM90-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; SM90-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; SM90-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
@@ -868,8 +868,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
 ; SM90-NEXT:    cvt.f32.bf16 %f6, %rs3;
 ; SM90-NEXT:    cvt.f32.bf16 %f7, %rs2;
 ; SM90-NEXT:    cvt.f32.bf16 %f8, %rs1;
-; SM90-NEXT:    st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
-; SM90-NEXT:    st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
+; SM90-NEXT:    st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
+; SM90-NEXT:    st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
 ; SM90-NEXT:    ret;
   %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
   %res = fpext <8 x bfloat> %load to <8 x float>
@@ -884,7 +884,7 @@ define i16 @test_fptosi_i16(bfloat %a) {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_fptosi_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %r1, [test_fptosi_i16_param_0];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
 ; SM70-NEXT:    cvt.rzi.s16.f32 %rs1, %f1;
@@ -943,7 +943,7 @@ define i16 @test_fptoui_i16(bfloat %a) {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_fptoui_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %r1, [test_fptoui_i16_param_0];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
 ; SM70-NEXT:    cvt.rzi.u16.f32 %rs1, %f1;
@@ -1003,7 +1003,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [test_sitofp_i16_param_0];
 ; SM70-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
 ; SM70-NEXT:    mov.b32 %r1, %f1;
 ; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
@@ -1022,7 +1022,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
 ; SM80-NEXT:    .reg .b32 %f<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
+; SM80-NEXT:    ld.param.b16 %rs1, [test_sitofp_i16_param_0];
 ; SM80-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
 ; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
@@ -1034,7 +1034,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
 ; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
+; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_sitofp_i16_param_0];
 ; SM80-FTZ-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
 ; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
@@ -1045,7 +1045,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
 ; SM90-NEXT:    .reg .b16 %rs<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [test_sitofp_i16_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [test_sitofp_i16_param_0];
 ; SM90-NEXT:    cvt.rn.bf16.s16 %rs2, %rs1;
 ; SM90-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM90-NEXT:    ret;
@@ -1062,7 +1062,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [test_uitofp_i8_param_0];
 ; SM70-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
 ; SM70-NEXT:    mov.b32 %r1, %f1;
 ; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
@@ -1081,7 +1081,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
 ; SM80-NEXT:    .reg .b32 %f<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
+; SM80-NEXT:    ld.param.b8 %rs1, [test_uitofp_i8_param_0];
 ; SM80-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
 ; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
@@ -1093,7 +1093,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
 ; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
+; SM80-FTZ-NEXT:    ld.param.b8 %rs1, [test_uitofp_i8_param_0];
 ; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
 ; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
@@ -1104,7 +1104,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
 ; SM90-NEXT:    .reg .b16 %rs<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [test_uitofp_i8_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [test_uitofp_i8_param_0];
 ; SM90-NEXT:    cvt.rn.bf16.u16 %rs2, %rs1;
 ; SM90-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM90-NEXT:    ret;
@@ -1121,7 +1121,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [test_uitofp_i1_param_0];
 ; SM70-NEXT:    and.b16 %rs2, %rs1, 1;
 ; SM70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; SM70-NEXT:    selp.b32 %r1, 1, 0, %p1;
@@ -1145,7 +1145,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
 ; SM80-NEXT:    .reg .b32 %f<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
+; SM80-NEXT:    ld.param.b8 %rs1, [test_uitofp_i1_param_0];
 ; SM80-NEXT:    and.b16 %rs2, %rs1, 1;
 ; SM80-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; SM80-NEXT:    selp.b32 %r1, 1, 0, %p1;
@@ -1162,7 +1162,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
 ; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
+; SM80-FTZ-NEXT:    ld.param.b8 %rs1, [test_uitofp_i1_param_0];
 ; SM80-FTZ-NEXT:    and.b16 %rs2, %rs1, 1;
 ; SM80-FTZ-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; SM80-FTZ-NEXT:    selp.b32 %r1, 1, 0, %p1;
@@ -1178,7 +1178,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
 ; SM90-NEXT:    .reg .b32 %r<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [test_uitofp_i1_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [test_uitofp_i1_param_0];
 ; SM90-NEXT:    and.b16 %rs2, %rs1, 1;
 ; SM90-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; SM90-NEXT:    selp.b32 %r1, 1, 0, %p1;
@@ -1198,7 +1198,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [test_uitofp_i16_param_0];
 ; SM70-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
 ; SM70-NEXT:    mov.b32 %r1, %f1;
 ; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
@@ -1217,7 +1217,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
 ; SM80-NEXT:    .reg .b32 %f<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
+; SM80-NEXT:    ld.param.b16 %rs1, [test_uitofp_i16_param_0];
 ; SM80-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
 ; SM80-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs2;
@@ -1229,7 +1229,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
 ; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
+; SM80-FTZ-NEXT:    ld.param.b16 %rs1, [test_uitofp_i16_param_0];
 ; SM80-FTZ-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
 ; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs2, %f1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs2;
@@ -1240,7 +1240,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
 ; SM90-NEXT:    .reg .b16 %rs<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [test_uitofp_i16_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [test_uitofp_i16_param_0];
 ; SM90-NEXT:    cvt.rn.bf16.u16 %rs2, %rs1;
 ; SM90-NEXT:    st.param.b16 [func_retval0], %rs2;
 ; SM90-NEXT:    ret;
@@ -1257,7 +1257,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
 ; SM70-NEXT:    .reg .b32 %f<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [test_uitofp_i32_param_0];
 ; SM70-NEXT:    cvt.rn.f32.u32 %f1, %r1;
 ; SM70-NEXT:    mov.b32 %r2, %f1;
 ; SM70-NEXT:    bfe.u32 %r3, %r2, 16, 1;
@@ -1277,7 +1277,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
 ; SM80-NEXT:    .reg .b32 %f<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
+; SM80-NEXT:    ld.param.b32 %r1, [test_uitofp_i32_param_0];
 ; SM80-NEXT:    cvt.rn.f32.u32 %f1, %r1;
 ; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
@@ -1290,7 +1290,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
 ; SM80-FTZ-NEXT:    .reg .b32 %f<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
+; SM80-FTZ-NEXT:    ld.param.b32 %r1, [test_uitofp_i32_param_0];
 ; SM80-FTZ-NEXT:    cvt.rn.f32.u32 %f1, %r1;
 ; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
@@ -1302,7 +1302,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
 ; SM90-NEXT:    .reg .b32 %r<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u32 %r1, [test_uitofp_i32_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [test_uitofp_i32_param_0];
 ; SM90-NEXT:    cvt.rn.bf16.u32 %rs1, %r1;
 ; SM90-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM90-NEXT:    ret;
@@ -1320,7 +1320,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [test_uitofp_i64_param_0];
 ; SM70-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
 ; SM70-NEXT:    mov.b32 %r1, %f1;
 ; SM70-NEXT:    bfe.u32 %r2, %r1, 16, 1;
@@ -1340,7 +1340,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
 ; SM80-NEXT:    .reg .b64 %rd<2>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
+; SM80-NEXT:    ld.param.b64 %rd1, [test_uitofp_i64_param_0];
 ; SM80-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
 ; SM80-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; SM80-NEXT:    st.param.b16 [func_retval0], %rs1;
@@ -1353,7 +1353,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
 ; SM80-FTZ-NEXT:    .reg .b64 %rd<2>;
 ; SM80-FTZ-EMPTY:
 ; SM80-FTZ-NEXT:  // %bb.0:
-; SM80-FTZ-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
+; SM80-FTZ-NEXT:    ld.param.b64 %rd1, [test_uitofp_i64_param_0];
 ; SM80-FTZ-NEXT:    cvt.rn.f32.u64 %f1, %rd1;
 ; SM80-FTZ-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; SM80-FTZ-NEXT:    st.param.b16 [func_retval0], %rs1;
@@ -1365,7 +1365,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [test_uitofp_i64_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [test_uitofp_i64_param_0];
 ; SM90-NEXT:    cvt.rn.bf16.u64 %rs1, %rd1;
 ; SM90-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; SM90-NEXT:    ret;
@@ -1382,7 +1382,7 @@ define bfloat @test_roundeven(bfloat %a) {
 ; SM70-NEXT:    .reg .b32 %f<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_roundeven_param_0];
+; SM70-NEXT:    ld.param.b16 %r1, [test_roundeven_param_0];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
 ; SM70-NEXT:    cvt.rni.f32.f32 %f2, %f1;
@@ -1514,10 +1514,10 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
 ; SM70-NEXT:    .reg .b32 %f<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %r1, [test_maxnum_param_1];
+; SM70-NEXT:    ld.param.b16 %r1, [test_maxnum_param_1];
 ; SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; SM70-NEXT:    mov.b32 %f1, %r2;
-; SM70-NEXT:    ld.param.u16 %r3, [test_maxnum_param_0];
+; SM70-NEXT:    ld.param.b16 %r3, [test_maxnum_param_0];
 ; SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; SM70-NEXT:    mov.b32 %f2, %r4;
 ; SM70-NEXT:    max.f32 %f3, %f2, %f1;
diff --git a/llvm/test/CodeGen/NVPTX/bf16.ll b/llvm/test/CodeGen/NVPTX/bf16.ll
index 98fdbbbdd9c75..059736751f61e 100644
--- a/llvm/test/CodeGen/NVPTX/bf16.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16.ll
@@ -26,8 +26,8 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
 
 define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; CHECK-LABEL: @test_bitcast_to_bfloat
-; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to bfloat
   store bfloat %val_fp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 677f0d795dde8..cd73b78eff97c 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -157,7 +157,7 @@ define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fneg_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
 ; CHECK-NEXT:    xor.b32 %r2, %r1, -2147450880;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -172,9 +172,9 @@ define void @test_ldst_v2bf16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2bf16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v2bf16_param_0];
 ; CHECK-NEXT:    ld.b32 %r1, [%rd1];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2bf16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v2bf16_param_1];
 ; CHECK-NEXT:    st.b32 [%rd2], %r1;
 ; CHECK-NEXT:    ret;
   %t1 = load <2 x bfloat>, ptr %a
@@ -190,11 +190,11 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3bf16_param_0];
-; CHECK-NEXT:    ld.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v3bf16_param_0];
+; CHECK-NEXT:    ld.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    mov.b64 {_, %r1}, %rd2;
-; CHECK-NEXT:    ld.param.u64 %rd3, [test_ldst_v3bf16_param_1];
-; CHECK-NEXT:    st.u32 [%rd3], %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_ldst_v3bf16_param_1];
+; CHECK-NEXT:    st.b32 [%rd3], %rd2;
 ; CHECK-NEXT:    mov.b32 {%rs1, _}, %r1;
 ; CHECK-NEXT:    st.b16 [%rd3+4], %rs1;
 ; CHECK-NEXT:    ret;
@@ -241,7 +241,7 @@ define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_select_param_2];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_param_1];
@@ -315,7 +315,7 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
 ; SM80-NEXT:    .reg .b32 %f<11>;
 ; SM80-EMPTY:
 ; SM80-NEXT:  // %bb.0:
-; SM80-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
+; SM80-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
 ; SM80-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2];
 ; SM80-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3];
 ; SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
@@ -326,10 +326,10 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
 ; SM80-NEXT:    cvt.f32.bf16 %f5, %rs2;
 ; SM80-NEXT:    cvt.f32.bf16 %f6, %rs4;
 ; SM80-NEXT:    setp.neu.f32 %p2, %f6, %f5;
-; SM80-NEXT:    ld.param.v2.f32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
+; SM80-NEXT:    ld.param.v2.b32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
 ; SM80-NEXT:    selp.f32 %f9, %f2, %f8, %p2;
 ; SM80-NEXT:    selp.f32 %f10, %f1, %f7, %p1;
-; SM80-NEXT:    st.param.v2.f32 [func_retval0], {%f10, %f9};
+; SM80-NEXT:    st.param.v2.b32 [func_retval0], {%f10, %f9};
 ; SM80-NEXT:    ret;
 ;
 ; SM90-LABEL: test_select_cc_f32_bf16(
@@ -339,14 +339,14 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
 ; SM90-NEXT:    .reg .b32 %f<7>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
+; SM90-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
 ; SM90-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3];
 ; SM90-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2];
 ; SM90-NEXT:    setp.neu.bf16x2 %p1|%p2, %r2, %r1;
-; SM90-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
+; SM90-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
 ; SM90-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
 ; SM90-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
-; SM90-NEXT:    st.param.v2.f32 [func_retval0], {%f6, %f5};
+; SM90-NEXT:    st.param.v2.b32 [func_retval0], {%f6, %f5};
 ; SM90-NEXT:    ret;
                                            <2 x bfloat> %c, <2 x bfloat> %d) #0 {
   %cc = fcmp une <2 x bfloat> %c, %d
@@ -365,8 +365,8 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1];
-; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
-; CHECK-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
+; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
 ; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
 ; CHECK-NEXT:    setp.neu.f32 %p2, %f2, %f4;
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
@@ -389,7 +389,7 @@ define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
 ; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f2, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -409,7 +409,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.bf16 %f1, %rs2;
 ; CHECK-NEXT:    cvt.f32.bf16 %f2, %rs1;
-; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
 ; CHECK-NEXT:    ret;
   %r = fpext <2 x bfloat> %a to <2 x float>
   ret <2 x float> %r
@@ -421,7 +421,7 @@ define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %r = bitcast <2 x bfloat> %a to <2 x i16>
@@ -507,7 +507,7 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fabs_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
 ; CHECK-NEXT:    and.b32 %r2, %r1, 2147450879;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/bfe.ll b/llvm/test/CodeGen/NVPTX/bfe.ll
index 0392f7786731a..644bf3606e8f6 100644
--- a/llvm/test/CodeGen/NVPTX/bfe.ll
+++ b/llvm/test/CodeGen/NVPTX/bfe.ll
@@ -12,7 +12,7 @@ define i32 @bfe0(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [bfe0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [bfe0_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 4, 4;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -27,7 +27,7 @@ define i32 @bfe1(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [bfe1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [bfe1_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 3, 3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -42,7 +42,7 @@ define i32 @bfe2(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [bfe2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [bfe2_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 5, 3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -57,7 +57,7 @@ define i32 @no_bfe_on_32bit_overflow(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [no_bfe_on_32bit_overflow_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [no_bfe_on_32bit_overflow_param_0];
 ; CHECK-NEXT:    shr.s32 %r2, %r1, 31;
 ; CHECK-NEXT:    and.b32 %r3, %r2, 15;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -73,7 +73,7 @@ define i32 @no_bfe_on_32bit_overflow_shr_and_pair(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [no_bfe_on_32bit_overflow_shr_and_pair_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [no_bfe_on_32bit_overflow_shr_and_pair_param_0];
 ; CHECK-NEXT:    shr.s32 %r2, %r1, 31;
 ; CHECK-NEXT:    and.b32 %r3, %r2, 15;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -89,7 +89,7 @@ define i64 @no_bfe_on_64bit_overflow(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [no_bfe_on_64bit_overflow_param_0];
 ; CHECK-NEXT:    shr.s64 %rd2, %rd1, 63;
 ; CHECK-NEXT:    and.b64 %rd3, %rd2, 7;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
@@ -105,7 +105,7 @@ define i64 @no_bfe_on_64bit_overflow_shr_and_pair(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_shr_and_pair_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [no_bfe_on_64bit_overflow_shr_and_pair_param_0];
 ; CHECK-NEXT:    shr.s64 %rd2, %rd1, 63;
 ; CHECK-NEXT:    and.b64 %rd3, %rd2, 7;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
@@ -121,7 +121,7 @@ define i32 @bfe_ashr_signed_32(i32 %x) {
 ; CHECK-O3-NEXT:    .reg .b32 %r<3>;
 ; CHECK-O3-EMPTY:
 ; CHECK-O3-NEXT:  // %bb.0:
-; CHECK-O3-NEXT:    ld.param.u16 %r1, [bfe_ashr_signed_32_param_0+2];
+; CHECK-O3-NEXT:    ld.param.b16 %r1, [bfe_ashr_signed_32_param_0+2];
 ; CHECK-O3-NEXT:    bfe.s32 %r2, %r1, 4, 12;
 ; CHECK-O3-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-O3-NEXT:    ret;
@@ -131,7 +131,7 @@ define i32 @bfe_ashr_signed_32(i32 %x) {
 ; CHECK-O0-NEXT:    .reg .b32 %r<3>;
 ; CHECK-O0-EMPTY:
 ; CHECK-O0-NEXT:  // %bb.0:
-; CHECK-O0-NEXT:    ld.param.u32 %r1, [bfe_ashr_signed_32_param_0];
+; CHECK-O0-NEXT:    ld.param.b32 %r1, [bfe_ashr_signed_32_param_0];
 ; CHECK-O0-NEXT:    bfe.s32 %r2, %r1, 20, 12;
 ; CHECK-O0-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-O0-NEXT:    ret;
@@ -146,7 +146,7 @@ define i32 @bfe_ashr_unsigned_32(i32 %x) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [bfe_ashr_unsigned_32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [bfe_ashr_unsigned_32_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 5, 6;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -161,7 +161,7 @@ define i64 @bfe_ashr_signed_64(i64 %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [bfe_ashr_signed_64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [bfe_ashr_signed_64_param_0];
 ; CHECK-NEXT:    bfe.s64 %rd2, %rd1, 16, 48;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
@@ -176,7 +176,7 @@ define i64 @bfe_ashr_unsigned_64(i64 %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [bfe_ashr_unsigned_64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [bfe_ashr_unsigned_64_param_0];
 ; CHECK-NEXT:    bfe.u64 %rd2, %rd1, 5, 6;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
@@ -192,7 +192,7 @@ define i32 @bfe3(i128 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [bfe3_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [bfe3_param_0];
 ; CHECK-NEXT:    cvt.u32.u64 %r1, %rd1;
 ; CHECK-NEXT:    bfe.s32 %r2, %r1, 15, 17;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
@@ -209,7 +209,7 @@ define i64 @bfe4(i128 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [bfe4_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [bfe4_param_0];
 ; CHECK-NEXT:    bfe.s64 %rd3, %rd1, 17, 47;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index 0054225e6d6e6..0e16682641edd 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -14,7 +14,7 @@ define i16 @bswap16(i16 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [bswap16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [bswap16_param_0];
 ; CHECK-NEXT:    shr.u16 %rs2, %rs1, 8;
 ; CHECK-NEXT:    shl.b16 %rs3, %rs1, 8;
 ; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs2;
@@ -32,7 +32,7 @@ define i32 @bswap32(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [bswap32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [bswap32_param_0];
 ; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -47,7 +47,7 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [bswapv2i16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [bswapv2i16_param_0];
 ; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 8961;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -62,7 +62,7 @@ define i64 @bswap64(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [bswap64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [bswap64_param_0];
 ; PTX70-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
 ; PTX70-NEXT:    prmt.b32 %r2, %r1, 0, 291;
 ; PTX70-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 33c6dbddd5297..79b0dbcf6494c 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -15,10 +15,10 @@ entry:
 ; CHECK: call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %input)
   %b = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
   %0 = load i32, ptr %b, align 4
-; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
+; PTX-NOT: ld.param.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
+; PTX: ld.param.b32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
   store i32 %0, ptr %output, align 4
-; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
+; PTX-NEXT: st.global.b32 [{{%rd[0-9]+}}], [[value]]
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index 0080aafcf5631..198878c1b96ff 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -13,15 +13,15 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u8 %rs1, [_Z3foobbbPb_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [_Z3foobbbPb_param_0];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.u8 %rs3, [_Z3foobbbPb_param_1];
-; CHECK-NEXT:    ld.param.u8 %rs4, [_Z3foobbbPb_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs3, [_Z3foobbbPb_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs4, [_Z3foobbbPb_param_2];
 ; CHECK-NEXT:    selp.b16 %rs5, %rs3, %rs4, %p1;
 ; CHECK-NEXT:    and.b16 %rs6, %rs5, 1;
-; CHECK-NEXT:    ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
-; CHECK-NEXT:    st.u8 [%rd1], %rs6;
+; CHECK-NEXT:    ld.param.b64 %rd1, [_Z3foobbbPb_param_3];
+; CHECK-NEXT:    st.b8 [%rd1], %rs6;
 ; CHECK-NEXT:    ret;
 entry:
   %.sink.v = select i1 %p1, i1 %p2, i1 %p3
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index c0bbf5b3559bb..c4d1537557cad 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -15,7 +15,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
 bb:
   %tmp5 = add nsw i64 %arg3, 8
   %tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5
-; CHECK: ld.global.nc.u16
+; CHECK: ld.global.nc.b16
   %tmp7 = load i16, ptr addrspace(1) %tmp6, align 2
 ; CHECK: cvt.s32.s16
   %tmp8 = sext i16 %tmp7 to i64
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 193df7f86ca72..3b30ce560edbc 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -10,7 +10,7 @@ target triple = "nvptx64-unknown-unknown"
 ; CHECK-LABEL: ex_zext
 define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
 entry:
-; CHECK: ld.global.nc.u8
+; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
 ; CHECK: cvt.u32.u8
   %valext = zext i8 %val to i32
@@ -21,7 +21,7 @@ entry:
 ; CHECK-LABEL: ex_sext
 define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
 entry:
-; CHECK: ld.global.nc.u8
+; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
 ; CHECK: cvt.s32.s8
   %valext = sext i8 %val to i32
@@ -32,7 +32,7 @@ entry:
 ; CHECK-LABEL: ex_zext_v2
 define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
-; CHECK: ld.global.nc.v2.u8
+; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
 ; CHECK: cvt.u32.u16
   %valext = zext <2 x i8> %val to <2 x i32>
@@ -43,7 +43,7 @@ entry:
 ; CHECK-LABEL: ex_sext_v2
 define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
 entry:
-; CHECK: ld.global.nc.v2.u8
+; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
 ; CHECK: cvt.s32.s8
   %valext = sext <2 x i8> %val to <2 x i32>
diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 9474b01f95ee8..c4a62f9f8c508 100644
--- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -23,11 +23,11 @@ entry:
 ; CHECK: .local .align 4 .b8 	__local_depot0[16]
 ; CHECK: mov.b64 %SPL
 
-; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
+; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
 ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
 ; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
-; CHECK: ld.global.f32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
-; CHECK: st.local.f32 [{{%rd[0-9]+}}], %f[[A0_REG]]
+; CHECK: ld.global.b32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
+; CHECK: st.local.b32 [{{%rd[0-9]+}}], %f[[A0_REG]]
 
   %0 = load float, ptr %a, align 4
   store float %0, ptr %buf, align 4
diff --git a/llvm/test/CodeGen/NVPTX/chain-different-as.ll b/llvm/test/CodeGen/NVPTX/chain-different-as.ll
index 704ed234f7fe6..f2d0d9d069ea6 100644
--- a/llvm/test/CodeGen/NVPTX/chain-different-as.ll
+++ b/llvm/test/CodeGen/NVPTX/chain-different-as.ll
@@ -9,8 +9,8 @@ define i64 @test() nounwind readnone {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.b64 %rd1, 1;
 ; CHECK-NEXT:    mov.b64 %rd2, 42;
-; CHECK-NEXT:    st.u64 [%rd1], %rd2;
-; CHECK-NEXT:    ld.global.u64 %rd3, [%rd1];
+; CHECK-NEXT:    st.b64 [%rd1], %rd2;
+; CHECK-NEXT:    ld.global.b64 %rd3, [%rd1];
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
   %addr0 = inttoptr i64 1 to ptr
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index 442da4debea8f..65a077d67e4ba 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -158,9 +158,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -192,8 +192,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -204,9 +204,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -238,8 +238,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -250,9 +250,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -284,8 +284,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -297,9 +297,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -331,8 +331,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -344,9 +344,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -378,8 +378,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -391,9 +391,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -425,8 +425,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -437,9 +437,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -471,8 +471,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -483,9 +483,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -517,8 +517,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -529,9 +529,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -563,8 +563,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -575,9 +575,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -609,8 +609,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -621,9 +621,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -655,8 +655,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM60-NEXT:    and.b32 %r10, %r9, 3;
@@ -667,9 +667,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -701,8 +701,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -714,9 +714,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -748,8 +748,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -761,9 +761,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -795,8 +795,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -808,9 +808,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -842,8 +842,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -855,9 +855,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -888,8 +888,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -901,9 +901,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -934,8 +934,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -947,9 +947,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -980,8 +980,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -993,9 +993,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1027,8 +1027,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1040,9 +1040,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1074,8 +1074,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1087,9 +1087,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1121,8 +1121,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1134,9 +1134,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1168,8 +1168,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1181,9 +1181,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1215,8 +1215,8 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1228,9 +1228,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1262,8 +1262,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1275,9 +1275,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1309,8 +1309,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1322,9 +1322,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1356,8 +1356,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1369,9 +1369,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1403,8 +1403,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1416,9 +1416,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1450,8 +1450,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1463,9 +1463,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1497,8 +1497,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1510,9 +1510,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1544,8 +1544,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1557,9 +1557,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1591,8 +1591,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1604,9 +1604,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1638,8 +1638,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1651,9 +1651,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1685,8 +1685,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1698,9 +1698,9 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1732,8 +1732,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1745,9 +1745,9 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1779,8 +1779,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1792,9 +1792,9 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1826,8 +1826,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1839,9 +1839,9 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1873,8 +1873,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1886,9 +1886,9 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1920,8 +1920,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1933,9 +1933,9 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1967,8 +1967,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1980,9 +1980,9 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2014,8 +2014,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -2027,9 +2027,9 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2061,8 +2061,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -2074,9 +2074,9 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM60-NEXT:    and.b32 %r14, %r13, 255;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2108,10 +2108,10 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2121,7 +2121,7 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2152,10 +2152,10 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2165,7 +2165,7 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2196,10 +2196,10 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2209,7 +2209,7 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2240,10 +2240,10 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2253,7 +2253,7 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2285,10 +2285,10 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2298,7 +2298,7 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2330,10 +2330,10 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2343,7 +2343,7 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2375,10 +2375,10 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2389,7 +2389,7 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2421,10 +2421,10 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2435,7 +2435,7 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2467,10 +2467,10 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2481,7 +2481,7 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2513,10 +2513,10 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2526,7 +2526,7 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2558,10 +2558,10 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2571,7 +2571,7 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2603,10 +2603,10 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2616,7 +2616,7 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2648,10 +2648,10 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2661,7 +2661,7 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2693,10 +2693,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2706,7 +2706,7 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2738,10 +2738,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1];
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
 ; SM60-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2751,7 +2751,7 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2783,10 +2783,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2797,7 +2797,7 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2829,10 +2829,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2843,7 +2843,7 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2875,10 +2875,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2889,7 +2889,7 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2921,10 +2921,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2935,7 +2935,7 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2966,10 +2966,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -2980,7 +2980,7 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3011,10 +3011,10 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3025,7 +3025,7 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3056,10 +3056,10 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3070,7 +3070,7 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3102,10 +3102,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3116,7 +3116,7 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3148,10 +3148,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3162,7 +3162,7 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3194,10 +3194,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3208,7 +3208,7 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3240,10 +3240,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3254,7 +3254,7 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3286,10 +3286,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3300,7 +3300,7 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3332,10 +3332,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3346,7 +3346,7 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3378,10 +3378,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3392,7 +3392,7 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3424,10 +3424,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3438,7 +3438,7 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3470,10 +3470,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3484,7 +3484,7 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3516,10 +3516,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3530,7 +3530,7 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3562,10 +3562,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3576,7 +3576,7 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3608,10 +3608,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3622,7 +3622,7 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3654,10 +3654,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3668,7 +3668,7 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3700,10 +3700,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3714,7 +3714,7 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3746,10 +3746,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3760,7 +3760,7 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3792,10 +3792,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3806,7 +3806,7 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3838,10 +3838,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3852,7 +3852,7 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3884,10 +3884,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3898,7 +3898,7 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3930,10 +3930,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3944,7 +3944,7 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3976,10 +3976,10 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -3990,7 +3990,7 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4022,10 +4022,10 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -4036,7 +4036,7 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4068,10 +4068,10 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -4082,7 +4082,7 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4114,10 +4114,10 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM60-NEXT:    and.b32 %r11, %r10, 3;
@@ -4128,7 +4128,7 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM60-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM60-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4158,9 +4158,9 @@ define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4175,9 +4175,9 @@ define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4192,9 +4192,9 @@ define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4209,9 +4209,9 @@ define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4226,9 +4226,9 @@ define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4243,9 +4243,9 @@ define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4260,10 +4260,10 @@ define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4278,10 +4278,10 @@ define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4296,10 +4296,10 @@ define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4314,9 +4314,9 @@ define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4331,9 +4331,9 @@ define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4348,9 +4348,9 @@ define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4365,9 +4365,9 @@ define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4382,9 +4382,9 @@ define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4399,9 +4399,9 @@ define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4416,10 +4416,10 @@ define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4434,10 +4434,10 @@ define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4452,10 +4452,10 @@ define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4470,9 +4470,9 @@ define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4487,9 +4487,9 @@ define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4504,9 +4504,9 @@ define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4521,9 +4521,9 @@ define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4538,9 +4538,9 @@ define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4555,9 +4555,9 @@ define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4572,10 +4572,10 @@ define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4590,10 +4590,10 @@ define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4608,10 +4608,10 @@ define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4626,9 +4626,9 @@ define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4643,9 +4643,9 @@ define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4660,9 +4660,9 @@ define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4677,9 +4677,9 @@ define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4694,9 +4694,9 @@ define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4711,9 +4711,9 @@ define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4728,10 +4728,10 @@ define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4746,10 +4746,10 @@ define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4764,10 +4764,10 @@ define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4782,10 +4782,10 @@ define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4800,10 +4800,10 @@ define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4818,10 +4818,10 @@ define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4836,10 +4836,10 @@ define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4854,10 +4854,10 @@ define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4872,10 +4872,10 @@ define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4890,10 +4890,10 @@ define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
 ; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4908,10 +4908,10 @@ define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4926,10 +4926,10 @@ define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM60-NEXT:    ret;
@@ -4943,9 +4943,9 @@ define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -4959,9 +4959,9 @@ define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -4975,9 +4975,9 @@ define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -4991,9 +4991,9 @@ define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5007,9 +5007,9 @@ define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5023,9 +5023,9 @@ define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5039,10 +5039,10 @@ define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5056,10 +5056,10 @@ define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5073,10 +5073,10 @@ define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5090,9 +5090,9 @@ define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5106,9 +5106,9 @@ define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5122,9 +5122,9 @@ define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5138,9 +5138,9 @@ define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5154,9 +5154,9 @@ define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5170,9 +5170,9 @@ define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5186,10 +5186,10 @@ define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5203,10 +5203,10 @@ define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5220,10 +5220,10 @@ define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5237,9 +5237,9 @@ define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5253,9 +5253,9 @@ define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5269,9 +5269,9 @@ define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5285,9 +5285,9 @@ define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5301,9 +5301,9 @@ define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5317,9 +5317,9 @@ define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5333,10 +5333,10 @@ define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5350,10 +5350,10 @@ define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5367,10 +5367,10 @@ define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5384,9 +5384,9 @@ define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5400,9 +5400,9 @@ define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5416,9 +5416,9 @@ define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5432,9 +5432,9 @@ define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5448,9 +5448,9 @@ define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5464,9 +5464,9 @@ define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5480,10 +5480,10 @@ define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5497,10 +5497,10 @@ define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5514,10 +5514,10 @@ define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5531,10 +5531,10 @@ define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5548,10 +5548,10 @@ define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5565,10 +5565,10 @@ define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5582,10 +5582,10 @@ define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5599,10 +5599,10 @@ define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5616,10 +5616,10 @@ define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5633,10 +5633,10 @@ define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
 ; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5650,10 +5650,10 @@ define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
 ; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
@@ -5667,10 +5667,10 @@ define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM60-NEXT:    .reg .b64 %rd<5>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
 ; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM60-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
index df8c49aaaa42c..7107fbcf6eb54 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -158,9 +158,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -192,8 +192,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -204,9 +204,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -238,8 +238,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -250,9 +250,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -284,8 +284,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -297,9 +297,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -331,8 +331,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -344,9 +344,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -378,8 +378,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -391,9 +391,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -425,8 +425,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -437,9 +437,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -471,8 +471,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -483,9 +483,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -517,8 +517,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -529,9 +529,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -563,8 +563,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -575,9 +575,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -609,8 +609,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -621,9 +621,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -655,8 +655,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -667,9 +667,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -701,8 +701,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -714,9 +714,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -748,8 +748,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -761,9 +761,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -795,8 +795,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -808,9 +808,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -842,8 +842,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -855,9 +855,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -888,8 +888,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -901,9 +901,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -934,8 +934,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -947,9 +947,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -980,8 +980,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -993,9 +993,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1027,8 +1027,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1040,9 +1040,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1074,8 +1074,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1087,9 +1087,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1121,8 +1121,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1134,9 +1134,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1168,8 +1168,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1181,9 +1181,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1215,8 +1215,8 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1228,9 +1228,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1262,8 +1262,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1275,9 +1275,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1309,8 +1309,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1322,9 +1322,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1356,8 +1356,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1369,9 +1369,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1403,8 +1403,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1416,9 +1416,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1450,8 +1450,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1463,9 +1463,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1497,8 +1497,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1510,9 +1510,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1544,8 +1544,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1557,9 +1557,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1591,8 +1591,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1604,9 +1604,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1638,8 +1638,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1651,9 +1651,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1685,8 +1685,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1698,9 +1698,9 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1732,8 +1732,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1745,9 +1745,9 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1779,8 +1779,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1792,9 +1792,9 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1826,8 +1826,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1839,9 +1839,9 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1873,8 +1873,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1886,9 +1886,9 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1920,8 +1920,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1933,9 +1933,9 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1967,8 +1967,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1980,9 +1980,9 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2014,8 +2014,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -2027,9 +2027,9 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2061,8 +2061,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -2074,9 +2074,9 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2108,10 +2108,10 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2121,7 +2121,7 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2152,10 +2152,10 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2165,7 +2165,7 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2196,10 +2196,10 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2209,7 +2209,7 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2240,10 +2240,10 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2253,7 +2253,7 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2285,10 +2285,10 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2298,7 +2298,7 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2330,10 +2330,10 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2343,7 +2343,7 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2375,10 +2375,10 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2389,7 +2389,7 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2421,10 +2421,10 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2435,7 +2435,7 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2467,10 +2467,10 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2481,7 +2481,7 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2513,10 +2513,10 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2526,7 +2526,7 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2558,10 +2558,10 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2571,7 +2571,7 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2603,10 +2603,10 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2616,7 +2616,7 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2648,10 +2648,10 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2661,7 +2661,7 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2693,10 +2693,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2706,7 +2706,7 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2738,10 +2738,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2751,7 +2751,7 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2783,10 +2783,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2797,7 +2797,7 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2829,10 +2829,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2843,7 +2843,7 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2875,10 +2875,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2889,7 +2889,7 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2921,10 +2921,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2935,7 +2935,7 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2966,10 +2966,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -2980,7 +2980,7 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3011,10 +3011,10 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3025,7 +3025,7 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3056,10 +3056,10 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3070,7 +3070,7 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3102,10 +3102,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3116,7 +3116,7 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3148,10 +3148,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3162,7 +3162,7 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3194,10 +3194,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3208,7 +3208,7 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3240,10 +3240,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3254,7 +3254,7 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3286,10 +3286,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3300,7 +3300,7 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3332,10 +3332,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3346,7 +3346,7 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3378,10 +3378,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3392,7 +3392,7 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3424,10 +3424,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3438,7 +3438,7 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3470,10 +3470,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3484,7 +3484,7 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3516,10 +3516,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3530,7 +3530,7 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3562,10 +3562,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3576,7 +3576,7 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3608,10 +3608,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3622,7 +3622,7 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3654,10 +3654,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3668,7 +3668,7 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3700,10 +3700,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3714,7 +3714,7 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3746,10 +3746,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3760,7 +3760,7 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3792,10 +3792,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3806,7 +3806,7 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3838,10 +3838,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3852,7 +3852,7 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3884,10 +3884,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3898,7 +3898,7 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3930,10 +3930,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3944,7 +3944,7 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3976,10 +3976,10 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -3990,7 +3990,7 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4022,10 +4022,10 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -4036,7 +4036,7 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4068,10 +4068,10 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -4082,7 +4082,7 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4114,10 +4114,10 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -4128,7 +4128,7 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4158,9 +4158,9 @@ define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
 ; SM70-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4175,9 +4175,9 @@ define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
 ; SM70-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4192,9 +4192,9 @@ define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
 ; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4209,9 +4209,9 @@ define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4226,9 +4226,9 @@ define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4243,9 +4243,9 @@ define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4260,10 +4260,10 @@ define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4278,10 +4278,10 @@ define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4296,10 +4296,10 @@ define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4314,9 +4314,9 @@ define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4331,9 +4331,9 @@ define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4348,9 +4348,9 @@ define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4365,9 +4365,9 @@ define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4382,9 +4382,9 @@ define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4399,9 +4399,9 @@ define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4416,10 +4416,10 @@ define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4434,10 +4434,10 @@ define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4452,10 +4452,10 @@ define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4470,9 +4470,9 @@ define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
 ; SM70-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4487,9 +4487,9 @@ define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
 ; SM70-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4504,9 +4504,9 @@ define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
 ; SM70-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4521,9 +4521,9 @@ define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4538,9 +4538,9 @@ define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
 ; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4555,9 +4555,9 @@ define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
 ; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4572,10 +4572,10 @@ define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4590,10 +4590,10 @@ define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4608,10 +4608,10 @@ define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4626,9 +4626,9 @@ define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4643,9 +4643,9 @@ define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
 ; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4660,9 +4660,9 @@ define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
 ; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4677,9 +4677,9 @@ define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4694,9 +4694,9 @@ define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
 ; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4711,9 +4711,9 @@ define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
 ; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4728,10 +4728,10 @@ define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4746,10 +4746,10 @@ define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4764,10 +4764,10 @@ define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4782,10 +4782,10 @@ define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4800,10 +4800,10 @@ define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4818,10 +4818,10 @@ define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4836,10 +4836,10 @@ define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4854,10 +4854,10 @@ define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4872,10 +4872,10 @@ define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4890,10 +4890,10 @@ define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4908,10 +4908,10 @@ define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4926,10 +4926,10 @@ define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -4943,9 +4943,9 @@ define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
 ; SM70-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -4959,9 +4959,9 @@ define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
 ; SM70-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -4975,9 +4975,9 @@ define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
 ; SM70-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -4991,9 +4991,9 @@ define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5007,9 +5007,9 @@ define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5023,9 +5023,9 @@ define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5039,10 +5039,10 @@ define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5056,10 +5056,10 @@ define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5073,10 +5073,10 @@ define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5090,9 +5090,9 @@ define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5106,9 +5106,9 @@ define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5122,9 +5122,9 @@ define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5138,9 +5138,9 @@ define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5154,9 +5154,9 @@ define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5170,9 +5170,9 @@ define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5186,10 +5186,10 @@ define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5203,10 +5203,10 @@ define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5220,10 +5220,10 @@ define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5237,9 +5237,9 @@ define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
 ; SM70-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5253,9 +5253,9 @@ define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
 ; SM70-NEXT:    atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5269,9 +5269,9 @@ define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
 ; SM70-NEXT:    atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5285,9 +5285,9 @@ define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5301,9 +5301,9 @@ define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
 ; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5317,9 +5317,9 @@ define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
 ; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5333,10 +5333,10 @@ define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5350,10 +5350,10 @@ define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5367,10 +5367,10 @@ define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5384,9 +5384,9 @@ define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5400,9 +5400,9 @@ define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
 ; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5416,9 +5416,9 @@ define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
 ; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5432,9 +5432,9 @@ define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5448,9 +5448,9 @@ define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
 ; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5464,9 +5464,9 @@ define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
 ; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5480,10 +5480,10 @@ define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5497,10 +5497,10 @@ define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5514,10 +5514,10 @@ define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5531,10 +5531,10 @@ define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5548,10 +5548,10 @@ define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5565,10 +5565,10 @@ define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5582,10 +5582,10 @@ define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5599,10 +5599,10 @@ define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5616,10 +5616,10 @@ define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5633,10 +5633,10 @@ define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5650,10 +5650,10 @@ define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
 ; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -5667,10 +5667,10 @@ define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
 ; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
index 6df7b3d695f7d..f289c3cf3d509 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -158,9 +158,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -192,8 +192,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -204,9 +204,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -238,8 +238,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -250,9 +250,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -284,8 +284,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -297,9 +297,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -331,8 +331,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -344,9 +344,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -378,8 +378,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -391,9 +391,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -425,8 +425,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -437,9 +437,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -471,8 +471,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -483,9 +483,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -517,8 +517,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -529,9 +529,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -563,8 +563,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -575,9 +575,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -609,8 +609,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -621,9 +621,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -655,8 +655,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -667,9 +667,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -701,8 +701,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -714,9 +714,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -748,8 +748,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -761,9 +761,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -795,8 +795,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -808,9 +808,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -842,8 +842,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -855,9 +855,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -888,8 +888,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -901,9 +901,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -934,8 +934,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -947,9 +947,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -980,8 +980,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -993,9 +993,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1027,8 +1027,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1040,9 +1040,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1074,8 +1074,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1087,9 +1087,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1121,8 +1121,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1134,9 +1134,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1168,8 +1168,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1181,9 +1181,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1215,8 +1215,8 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1228,9 +1228,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1262,8 +1262,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1275,9 +1275,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1309,8 +1309,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1322,9 +1322,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1356,8 +1356,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1369,9 +1369,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1403,8 +1403,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1416,9 +1416,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1450,8 +1450,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1463,9 +1463,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1497,8 +1497,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1510,9 +1510,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1544,8 +1544,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1557,9 +1557,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1591,8 +1591,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1604,9 +1604,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1638,8 +1638,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1651,9 +1651,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1685,8 +1685,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1698,9 +1698,9 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1732,8 +1732,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1745,9 +1745,9 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1779,8 +1779,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1792,9 +1792,9 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1826,8 +1826,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1839,9 +1839,9 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1873,8 +1873,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1886,9 +1886,9 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1920,8 +1920,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1933,9 +1933,9 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1967,8 +1967,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -1980,9 +1980,9 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2014,8 +2014,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -2027,9 +2027,9 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2061,8 +2061,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -2074,9 +2074,9 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2108,10 +2108,10 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2121,7 +2121,7 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2152,10 +2152,10 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2165,7 +2165,7 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2196,10 +2196,10 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2209,7 +2209,7 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2240,10 +2240,10 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2253,7 +2253,7 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2285,10 +2285,10 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2298,7 +2298,7 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2330,10 +2330,10 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2343,7 +2343,7 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2375,10 +2375,10 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2389,7 +2389,7 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2421,10 +2421,10 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2435,7 +2435,7 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2467,10 +2467,10 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2481,7 +2481,7 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2513,10 +2513,10 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2526,7 +2526,7 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2558,10 +2558,10 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2571,7 +2571,7 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2603,10 +2603,10 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2616,7 +2616,7 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2648,10 +2648,10 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2661,7 +2661,7 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2693,10 +2693,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_global_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2706,7 +2706,7 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2738,10 +2738,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -2751,7 +2751,7 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2783,10 +2783,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2797,7 +2797,7 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2829,10 +2829,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2843,7 +2843,7 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2875,10 +2875,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2889,7 +2889,7 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2921,10 +2921,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2935,7 +2935,7 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -2966,10 +2966,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -2980,7 +2980,7 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3011,10 +3011,10 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_monotonic_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3025,7 +3025,7 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3056,10 +3056,10 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3070,7 +3070,7 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3102,10 +3102,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3116,7 +3116,7 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3148,10 +3148,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i16_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_acquire_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3162,7 +3162,7 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3194,10 +3194,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3208,7 +3208,7 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3240,10 +3240,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3254,7 +3254,7 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3286,10 +3286,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3300,7 +3300,7 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3332,10 +3332,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3346,7 +3346,7 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3378,10 +3378,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3392,7 +3392,7 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3424,10 +3424,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3438,7 +3438,7 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3470,10 +3470,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3484,7 +3484,7 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3516,10 +3516,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3530,7 +3530,7 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3562,10 +3562,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3576,7 +3576,7 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3608,10 +3608,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3622,7 +3622,7 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3654,10 +3654,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3668,7 +3668,7 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3700,10 +3700,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3714,7 +3714,7 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3746,10 +3746,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3760,7 +3760,7 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3792,10 +3792,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3806,7 +3806,7 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3838,10 +3838,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3852,7 +3852,7 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3884,10 +3884,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3898,7 +3898,7 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3930,10 +3930,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3944,7 +3944,7 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -3976,10 +3976,10 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -3990,7 +3990,7 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4022,10 +4022,10 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -4036,7 +4036,7 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4068,10 +4068,10 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -4082,7 +4082,7 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.global.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4114,10 +4114,10 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -4128,7 +4128,7 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.shared.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -4158,9 +4158,9 @@ define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2];
 ; SM90-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4175,9 +4175,9 @@ define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2];
 ; SM90-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4192,9 +4192,9 @@ define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2];
 ; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4209,9 +4209,9 @@ define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4226,9 +4226,9 @@ define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4243,9 +4243,9 @@ define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4260,10 +4260,10 @@ define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4278,10 +4278,10 @@ define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4296,10 +4296,10 @@ define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4314,9 +4314,9 @@ define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4331,9 +4331,9 @@ define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4348,9 +4348,9 @@ define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4365,9 +4365,9 @@ define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4382,9 +4382,9 @@ define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4399,9 +4399,9 @@ define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4416,10 +4416,10 @@ define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4434,10 +4434,10 @@ define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4452,10 +4452,10 @@ define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4470,9 +4470,9 @@ define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_generic_param_2];
 ; SM90-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4487,9 +4487,9 @@ define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_global_param_2];
 ; SM90-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4504,9 +4504,9 @@ define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_monotonic_i32_shared_param_2];
 ; SM90-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4521,9 +4521,9 @@ define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_generic_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4538,9 +4538,9 @@ define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_global_param_2];
 ; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4555,9 +4555,9 @@ define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_acquire_i32_shared_param_2];
 ; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4572,10 +4572,10 @@ define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4590,10 +4590,10 @@ define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4608,10 +4608,10 @@ define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4626,9 +4626,9 @@ define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4643,9 +4643,9 @@ define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2];
 ; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4660,9 +4660,9 @@ define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2];
 ; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4677,9 +4677,9 @@ define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4694,9 +4694,9 @@ define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2];
 ; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4711,9 +4711,9 @@ define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2];
 ; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4728,10 +4728,10 @@ define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4746,10 +4746,10 @@ define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4764,10 +4764,10 @@ define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4782,10 +4782,10 @@ define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4800,10 +4800,10 @@ define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4818,10 +4818,10 @@ define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4836,10 +4836,10 @@ define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4854,10 +4854,10 @@ define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4872,10 +4872,10 @@ define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4890,10 +4890,10 @@ define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4908,10 +4908,10 @@ define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4926,10 +4926,10 @@ define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -4943,9 +4943,9 @@ define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2];
 ; SM90-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -4959,9 +4959,9 @@ define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2];
 ; SM90-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -4975,9 +4975,9 @@ define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2];
 ; SM90-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -4991,9 +4991,9 @@ define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5007,9 +5007,9 @@ define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5023,9 +5023,9 @@ define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5039,10 +5039,10 @@ define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5056,10 +5056,10 @@ define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5073,10 +5073,10 @@ define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5090,9 +5090,9 @@ define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5106,9 +5106,9 @@ define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5122,9 +5122,9 @@ define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5138,9 +5138,9 @@ define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5154,9 +5154,9 @@ define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5170,9 +5170,9 @@ define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5186,10 +5186,10 @@ define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5203,10 +5203,10 @@ define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5220,10 +5220,10 @@ define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5237,9 +5237,9 @@ define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2];
 ; SM90-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5253,9 +5253,9 @@ define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_global_param_2];
 ; SM90-NEXT:    atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5269,9 +5269,9 @@ define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2];
 ; SM90-NEXT:    atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5285,9 +5285,9 @@ define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_generic_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5301,9 +5301,9 @@ define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_global_param_2];
 ; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5317,9 +5317,9 @@ define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_acquire_i64_shared_param_2];
 ; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5333,10 +5333,10 @@ define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5350,10 +5350,10 @@ define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5367,10 +5367,10 @@ define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5384,9 +5384,9 @@ define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5400,9 +5400,9 @@ define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2];
 ; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5416,9 +5416,9 @@ define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
 ; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5432,9 +5432,9 @@ define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5448,9 +5448,9 @@ define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2];
 ; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5464,9 +5464,9 @@ define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2];
 ; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5480,10 +5480,10 @@ define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5497,10 +5497,10 @@ define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5514,10 +5514,10 @@ define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5531,10 +5531,10 @@ define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5548,10 +5548,10 @@ define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5565,10 +5565,10 @@ define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5582,10 +5582,10 @@ define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5599,10 +5599,10 @@ define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5616,10 +5616,10 @@ define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5633,10 +5633,10 @@ define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5650,10 +5650,10 @@ define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
 ; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -5667,10 +5667,10 @@ define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
 ; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index e5f05e49d2fef..9eeff9d7c2b75 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -18,8 +18,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT:    ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM30-NEXT:    and.b32 %r10, %r9, 3;
@@ -30,9 +30,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM30-NEXT:    and.b32 %r14, %r13, 255;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r15, [relaxed_sys_i8_param_1];
 ; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -59,8 +59,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -71,9 +71,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [relaxed_sys_i8_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -99,8 +99,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -111,9 +111,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [relaxed_sys_i8_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -144,8 +144,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u8 %rs1, [acquire_sys_i8_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM30-NEXT:    ld.param.b8 %rs1, [acquire_sys_i8_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [acquire_sys_i8_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM30-NEXT:    and.b32 %r10, %r9, 3;
@@ -156,9 +156,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM30-NEXT:    and.b32 %r14, %r13, 255;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r15, [acquire_sys_i8_param_1];
 ; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -186,8 +186,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acquire_sys_i8_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acquire_sys_i8_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_sys_i8_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM70-NEXT:    and.b32 %r10, %r9, 3;
@@ -198,9 +198,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acquire_sys_i8_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -227,8 +227,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acquire_sys_i8_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acquire_sys_i8_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_sys_i8_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
 ; SM90-NEXT:    and.b32 %r10, %r9, 3;
@@ -239,9 +239,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acquire_sys_i8_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -273,8 +273,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u8 %rs1, [release_sys_i8_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM30-NEXT:    ld.param.b8 %rs1, [release_sys_i8_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [release_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -286,9 +286,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM30-NEXT:    and.b32 %r14, %r13, 255;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r15, [release_sys_i8_param_1];
 ; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -315,8 +315,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [release_sys_i8_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [release_sys_i8_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_sys_i8_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -328,9 +328,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [release_sys_i8_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -356,8 +356,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [release_sys_i8_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [release_sys_i8_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_sys_i8_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -369,9 +369,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [release_sys_i8_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -402,8 +402,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM30-NEXT:    ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -415,9 +415,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM30-NEXT:    and.b32 %r14, %r13, 255;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
 ; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -445,8 +445,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -458,9 +458,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -487,8 +487,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
 ; SM90-NEXT:    fence.release.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -500,9 +500,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -534,8 +534,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM30-NEXT:    ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -547,9 +547,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM30-NEXT:    and.b32 %r14, %r13, 255;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM30-NEXT:    ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
 ; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -577,8 +577,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
 ; SM70-NEXT:    fence.sc.sys;
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -590,9 +590,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM70-NEXT:    and.b32 %r14, %r13, 255;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
 ; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -619,8 +619,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
 ; SM90-NEXT:    fence.sc.sys;
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
@@ -632,9 +632,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
 ; SM90-NEXT:    and.b32 %r14, %r13, 255;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
 ; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -667,10 +667,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM30-NEXT:    ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r9, [relaxed_sys_i16_param_1];
 ; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM30-NEXT:    and.b32 %r11, %r10, 3;
 ; SM30-NEXT:    shl.b32 %r1, %r11, 3;
@@ -680,7 +680,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    ld.b32 %r15, [%rd1];
 ; SM30-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM30-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -707,10 +707,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [relaxed_sys_i16_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -720,7 +720,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -746,10 +746,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [relaxed_sys_i16_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -759,7 +759,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -790,10 +790,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u16 %rs1, [acquire_sys_i16_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM30-NEXT:    ld.param.b16 %rs1, [acquire_sys_i16_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [acquire_sys_i16_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r9, [acquire_sys_i16_param_1];
 ; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM30-NEXT:    and.b32 %r11, %r10, 3;
 ; SM30-NEXT:    shl.b32 %r1, %r11, 3;
@@ -803,7 +803,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    ld.b32 %r15, [%rd1];
 ; SM30-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM30-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -831,10 +831,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acquire_sys_i16_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acquire_sys_i16_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_sys_i16_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acquire_sys_i16_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -844,7 +844,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -871,10 +871,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acquire_sys_i16_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acquire_sys_i16_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_sys_i16_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acquire_sys_i16_param_1];
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
 ; SM90-NEXT:    shl.b32 %r1, %r11, 3;
@@ -884,7 +884,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -916,10 +916,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u16 %rs1, [release_sys_i16_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM30-NEXT:    ld.param.b16 %rs1, [release_sys_i16_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [release_sys_i16_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r9, [release_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM30-NEXT:    and.b32 %r11, %r10, 3;
@@ -930,7 +930,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    ld.b32 %r15, [%rd1];
 ; SM30-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM30-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -957,10 +957,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [release_sys_i16_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [release_sys_i16_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_sys_i16_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [release_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -971,7 +971,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -997,10 +997,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [release_sys_i16_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [release_sys_i16_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_sys_i16_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [release_sys_i16_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -1011,7 +1011,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1042,10 +1042,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM30-NEXT:    ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM30-NEXT:    and.b32 %r11, %r10, 3;
@@ -1056,7 +1056,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    ld.b32 %r15, [%rd1];
 ; SM30-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM30-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1084,10 +1084,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -1098,7 +1098,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1125,10 +1125,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
 ; SM90-NEXT:    fence.release.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -1139,7 +1139,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1172,10 +1172,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM30-NEXT:    ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
+; SM30-NEXT:    ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM30-NEXT:    and.b32 %r11, %r10, 3;
@@ -1186,7 +1186,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    ld.b32 %r15, [%rd1];
 ; SM30-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM30-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1214,10 +1214,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
+; SM70-NEXT:    ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
@@ -1228,7 +1228,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    ld.b32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1255,10 +1255,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<3>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM90-NEXT:    ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
+; SM90-NEXT:    ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
 ; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM90-NEXT:    and.b32 %r11, %r10, 3;
@@ -1269,7 +1269,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
 ; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
-; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    ld.b32 %r15, [%rd1];
 ; SM90-NEXT:    and.b32 %r19, %r15, %r2;
 ; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -1300,9 +1300,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<2>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
-; SM30-NEXT:    ld.param.u32 %r1, [relaxed_sys_i32_param_1];
-; SM30-NEXT:    ld.param.u32 %r2, [relaxed_sys_i32_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i32_param_0];
+; SM30-NEXT:    ld.param.b32 %r1, [relaxed_sys_i32_param_1];
+; SM30-NEXT:    ld.param.b32 %r2, [relaxed_sys_i32_param_2];
 ; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM30-NEXT:    ret;
@@ -1313,9 +1313,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [relaxed_sys_i32_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [relaxed_sys_i32_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i32_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [relaxed_sys_i32_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [relaxed_sys_i32_param_2];
 ; SM70-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -1325,9 +1325,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [relaxed_sys_i32_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [relaxed_sys_i32_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i32_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [relaxed_sys_i32_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [relaxed_sys_i32_param_2];
 ; SM90-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -1342,9 +1342,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<2>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
-; SM30-NEXT:    ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
-; SM30-NEXT:    ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i32_param_0];
+; SM30-NEXT:    ld.param.b32 %r1, [acq_rel_sys_i32_param_1];
+; SM30-NEXT:    ld.param.b32 %r2, [acq_rel_sys_i32_param_2];
 ; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM30-NEXT:    ret;
@@ -1355,9 +1355,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i32_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acq_rel_sys_i32_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acq_rel_sys_i32_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -1367,9 +1367,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i32_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acq_rel_sys_i32_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acq_rel_sys_i32_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -1384,9 +1384,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<2>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [acquire_sys_i32_param_0];
-; SM30-NEXT:    ld.param.u32 %r1, [acquire_sys_i32_param_1];
-; SM30-NEXT:    ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [acquire_sys_i32_param_0];
+; SM30-NEXT:    ld.param.b32 %r1, [acquire_sys_i32_param_1];
+; SM30-NEXT:    ld.param.b32 %r2, [acquire_sys_i32_param_2];
 ; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM30-NEXT:    ret;
@@ -1397,9 +1397,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_sys_i32_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [acquire_sys_i32_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_sys_i32_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [acquire_sys_i32_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [acquire_sys_i32_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -1409,9 +1409,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_sys_i32_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [acquire_sys_i32_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_sys_i32_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [acquire_sys_i32_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [acquire_sys_i32_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -1426,9 +1426,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<2>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [release_sys_i32_param_0];
-; SM30-NEXT:    ld.param.u32 %r1, [release_sys_i32_param_1];
-; SM30-NEXT:    ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [release_sys_i32_param_0];
+; SM30-NEXT:    ld.param.b32 %r1, [release_sys_i32_param_1];
+; SM30-NEXT:    ld.param.b32 %r2, [release_sys_i32_param_2];
 ; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM30-NEXT:    ret;
@@ -1439,9 +1439,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_sys_i32_param_0];
-; SM70-NEXT:    ld.param.u32 %r1, [release_sys_i32_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_sys_i32_param_0];
+; SM70-NEXT:    ld.param.b32 %r1, [release_sys_i32_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [release_sys_i32_param_2];
 ; SM70-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -1451,9 +1451,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_sys_i32_param_0];
-; SM90-NEXT:    ld.param.u32 %r1, [release_sys_i32_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_sys_i32_param_0];
+; SM90-NEXT:    ld.param.b32 %r1, [release_sys_i32_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [release_sys_i32_param_2];
 ; SM90-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM90-NEXT:    ret;
@@ -1468,10 +1468,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<2>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM30-NEXT:    ld.param.b64 %rd1, [seq_cst_sys_i32_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
-; SM30-NEXT:    ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM30-NEXT:    ld.param.b32 %r1, [seq_cst_sys_i32_param_1];
+; SM30-NEXT:    ld.param.b32 %r2, [seq_cst_sys_i32_param_2];
 ; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM30-NEXT:    ret;
@@ -1482,10 +1482,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_sys_i32_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
-; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM70-NEXT:    ld.param.b32 %r1, [seq_cst_sys_i32_param_1];
+; SM70-NEXT:    ld.param.b32 %r2, [seq_cst_sys_i32_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
@@ -1495,10 +1495,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<2>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_sys_i32_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
-; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM90-NEXT:    ld.param.b32 %r1, [seq_cst_sys_i32_param_1];
+; SM90-NEXT:    ld.param.b32 %r2, [seq_cst_sys_i32_param_2];
 ; SM90-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
@@ -1514,9 +1514,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<5>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
-; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
-; SM30-NEXT:    ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i64_param_0];
+; SM30-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i64_param_1];
+; SM30-NEXT:    ld.param.b64 %rd3, [relaxed_sys_i64_param_2];
 ; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM30-NEXT:    ret;
@@ -1526,9 +1526,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i64_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i64_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [relaxed_sys_i64_param_2];
 ; SM70-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -1537,9 +1537,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [relaxed_sys_i64_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i64_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [relaxed_sys_i64_param_2];
 ; SM90-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -1553,9 +1553,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<5>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [acquire_sys_i64_param_0];
-; SM30-NEXT:    ld.param.u64 %rd2, [acquire_sys_i64_param_1];
-; SM30-NEXT:    ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [acquire_sys_i64_param_0];
+; SM30-NEXT:    ld.param.b64 %rd2, [acquire_sys_i64_param_1];
+; SM30-NEXT:    ld.param.b64 %rd3, [acquire_sys_i64_param_2];
 ; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM30-NEXT:    ret;
@@ -1565,9 +1565,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acquire_sys_i64_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acquire_sys_i64_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acquire_sys_i64_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acquire_sys_i64_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acquire_sys_i64_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -1576,9 +1576,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acquire_sys_i64_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acquire_sys_i64_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acquire_sys_i64_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acquire_sys_i64_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acquire_sys_i64_param_2];
 ; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -1592,9 +1592,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<5>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
-; SM30-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
-; SM30-NEXT:    ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i64_param_0];
+; SM30-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i64_param_1];
+; SM30-NEXT:    ld.param.b64 %rd3, [acq_rel_sys_i64_param_2];
 ; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM30-NEXT:    ret;
@@ -1604,9 +1604,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i64_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i64_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [acq_rel_sys_i64_param_2];
 ; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -1615,9 +1615,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [acq_rel_sys_i64_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i64_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [acq_rel_sys_i64_param_2];
 ; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -1631,9 +1631,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<5>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [release_sys_i64_param_0];
-; SM30-NEXT:    ld.param.u64 %rd2, [release_sys_i64_param_1];
-; SM30-NEXT:    ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM30-NEXT:    ld.param.b64 %rd1, [release_sys_i64_param_0];
+; SM30-NEXT:    ld.param.b64 %rd2, [release_sys_i64_param_1];
+; SM30-NEXT:    ld.param.b64 %rd3, [release_sys_i64_param_2];
 ; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM30-NEXT:    ret;
@@ -1643,9 +1643,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [release_sys_i64_param_0];
-; SM70-NEXT:    ld.param.u64 %rd2, [release_sys_i64_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM70-NEXT:    ld.param.b64 %rd1, [release_sys_i64_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [release_sys_i64_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [release_sys_i64_param_2];
 ; SM70-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -1654,9 +1654,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [release_sys_i64_param_0];
-; SM90-NEXT:    ld.param.u64 %rd2, [release_sys_i64_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM90-NEXT:    ld.param.b64 %rd1, [release_sys_i64_param_0];
+; SM90-NEXT:    ld.param.b64 %rd2, [release_sys_i64_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [release_sys_i64_param_2];
 ; SM90-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM90-NEXT:    ret;
@@ -1670,10 +1670,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<5>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM30-NEXT:    ld.param.b64 %rd1, [seq_cst_sys_i64_param_0];
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
-; SM30-NEXT:    ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM30-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i64_param_1];
+; SM30-NEXT:    ld.param.b64 %rd3, [seq_cst_sys_i64_param_2];
 ; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM30-NEXT:    ret;
@@ -1683,10 +1683,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<5>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM70-NEXT:    ld.param.b64 %rd1, [seq_cst_sys_i64_param_0];
 ; SM70-NEXT:    fence.sc.sys;
-; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
-; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i64_param_1];
+; SM70-NEXT:    ld.param.b64 %rd3, [seq_cst_sys_i64_param_2];
 ; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
@@ -1695,10 +1695,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM90-NEXT:    .reg .b64 %rd<5>;
 ; SM90-EMPTY:
 ; SM90-NEXT:  // %bb.0:
-; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM90-NEXT:    ld.param.b64 %rd1, [seq_cst_sys_i64_param_0];
 ; SM90-NEXT:    fence.sc.sys;
-; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
-; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i64_param_1];
+; SM90-NEXT:    ld.param.b64 %rd3, [seq_cst_sys_i64_param_2];
 ; SM90-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM90-NEXT:    fence.acquire.sys;
 ; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll
index 319cadcb27f0f..dc6d504c2c66c 100644
--- a/llvm/test/CodeGen/NVPTX/combine-mad.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll
@@ -11,8 +11,8 @@ define i32 @test1(i32 %n, i32 %m) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test1_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test1_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test1_param_1];
 ; CHECK-NEXT:    mad.lo.s32 %r3, %r2, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -28,8 +28,8 @@ define i32 @test1_rev(i32 %n, i32 %m) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test1_rev_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test1_rev_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test1_rev_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test1_rev_param_1];
 ; CHECK-NEXT:    mad.lo.s32 %r3, %r2, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -47,9 +47,9 @@ define i32 @test2(i32 %n, i32 %m, i32 %s) {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test2_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test2_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test2_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test2_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test2_param_2];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r3, 1;
 ; CHECK-NEXT:    mad.lo.s32 %r4, %r2, %r1, %r2;
 ; CHECK-NEXT:    selp.b32 %r5, %r2, %r4, %p1;
@@ -71,9 +71,9 @@ define i32 @test2_rev1(i32 %n, i32 %m, i32 %s) {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test2_rev1_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test2_rev1_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test2_rev1_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test2_rev1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test2_rev1_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test2_rev1_param_2];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r3, 1;
 ; CHECK-NEXT:    mad.lo.s32 %r4, %r2, %r1, %r2;
 ; CHECK-NEXT:    selp.b32 %r5, %r4, %r2, %p1;
@@ -95,9 +95,9 @@ define i32 @test2_rev2(i32 %n, i32 %m, i32 %s) {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test2_rev2_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test2_rev2_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test2_rev2_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test2_rev2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test2_rev2_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test2_rev2_param_2];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r3, 1;
 ; CHECK-NEXT:    mad.lo.s32 %r4, %r2, %r1, %r2;
 ; CHECK-NEXT:    selp.b32 %r5, %r4, %r2, %p1;
@@ -119,10 +119,10 @@ define i32 @test3(i32 %n, i32 %m, i32 %s) {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test3_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test3_param_0];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 3;
-; CHECK-NEXT:    ld.param.u32 %r3, [test3_param_1];
-; CHECK-NEXT:    ld.param.u32 %r4, [test3_param_2];
+; CHECK-NEXT:    ld.param.b32 %r3, [test3_param_1];
+; CHECK-NEXT:    ld.param.b32 %r4, [test3_param_2];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r4, 1;
 ; CHECK-NEXT:    selp.b32 %r5, 1, %r2, %p1;
 ; CHECK-NEXT:    mul.lo.s32 %r6, %r5, %r3;
@@ -144,12 +144,12 @@ define i32 @test4(i32 %a, i32 %b, i32 %c, i1 %p) {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test4_param_3];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test4_param_3];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.u32 %r1, [test4_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test4_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test4_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test4_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test4_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test4_param_2];
 ; CHECK-NEXT:    mad.lo.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    selp.b32 %r5, %r4, %r3, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
@@ -168,12 +168,12 @@ define i32 @test4_rev(i32 %a, i32 %b, i32 %c, i1 %p) {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test4_rev_param_3];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test4_rev_param_3];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.u32 %r1, [test4_rev_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test4_rev_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test4_rev_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test4_rev_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test4_rev_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test4_rev_param_2];
 ; CHECK-NEXT:    mad.lo.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    selp.b32 %r5, %r3, %r4, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
@@ -192,10 +192,10 @@ define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_mad_multi_use_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_mad_multi_use_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_mad_multi_use_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_mad_multi_use_param_1];
 ; CHECK-NEXT:    mul.lo.s32 %r3, %r1, %r2;
-; CHECK-NEXT:    ld.param.u32 %r4, [test_mad_multi_use_param_2];
+; CHECK-NEXT:    ld.param.b32 %r4, [test_mad_multi_use_param_2];
 ; CHECK-NEXT:    add.s32 %r5, %r3, %r4;
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .b32 param0;
@@ -227,7 +227,7 @@ define i32 @test_mad_fold(i32 %x) {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_mad_fold_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_mad_fold_param_0];
 ; CHECK-NEXT:    mul.hi.s32 %r2, %r1, -2147221471;
 ; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
 ; CHECK-NEXT:    shr.u32 %r4, %r3, 31;
diff --git a/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll b/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
index 670e112c26c76..cfb064c85e074 100644
--- a/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
@@ -11,7 +11,7 @@ define i8 @cvt_u8_f32(float %x) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_u8_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_u8_f32_param_0];
 ; CHECK-NEXT:    cvt.rzi.u16.f32 %rs1, %f1;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -28,7 +28,7 @@ define i8 @cvt_u8_f64(double %x) {
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [cvt_u8_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [cvt_u8_f64_param_0];
 ; CHECK-NEXT:    cvt.rzi.u16.f64 %rs1, %fd1;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -44,9 +44,9 @@ define float @cvt_f32_i8(i8 %x) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [cvt_f32_i8_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [cvt_f32_i8_param_0];
 ; CHECK-NEXT:    cvt.rn.f32.u16 %f1, %rs1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
 ; CHECK-NEXT:    ret;
   %a = uitofp i8 %x to float
   ret float %a
@@ -59,9 +59,9 @@ define double @cvt_f64_i8(i8 %x) {
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [cvt_f64_i8_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [cvt_f64_i8_param_0];
 ; CHECK-NEXT:    cvt.rn.f64.u16 %fd1, %rs1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
 ; CHECK-NEXT:    ret;
   %a = uitofp i8 %x to double
   ret double %a
@@ -76,7 +76,7 @@ define float @cvt_f32_s8(i8 %x) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.s8 %rs1, [cvt_f32_s8_param_0];
 ; CHECK-NEXT:    cvt.rn.f32.s16 %f1, %rs1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
 ; CHECK-NEXT:    ret;
   %a = sitofp i8 %x to float
   ret float %a
@@ -91,7 +91,7 @@ define double @cvt_f64_s8(i8 %x) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.s8 %rs1, [cvt_f64_s8_param_0];
 ; CHECK-NEXT:    cvt.rn.f64.s16 %fd1, %rs1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
 ; CHECK-NEXT:    ret;
   %a = sitofp i8 %x to double
   ret double %a
@@ -105,7 +105,7 @@ define i8 @cvt_s8_f32(float %x) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_s8_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_s8_f32_param_0];
 ; CHECK-NEXT:    cvt.rzi.s16.f32 %rs1, %f1;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 255;
@@ -123,7 +123,7 @@ define i8 @cvt_s8_f64(double %x) {
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [cvt_s8_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [cvt_s8_f64_param_0];
 ; CHECK-NEXT:    cvt.rzi.s16.f64 %rs1, %fd1;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 255;
diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
index 15dd899b714ac..ce6a16d9c0400 100644
--- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
@@ -10,7 +10,7 @@
 ; i16
 
 define i16 @cvt_i16_i32(i32 %x) {
-; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}]
+; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}]
 ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
 ; CHECK: ret
   %a = trunc i32 %x to i16
@@ -18,7 +18,7 @@ define i16 @cvt_i16_i32(i32 %x) {
 }
 
 define i16 @cvt_i16_i64(i64 %x) {
-; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}]
+; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}]
 ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
 ; CHECK: ret
   %a = trunc i64 %x to i16
@@ -30,7 +30,7 @@ define i16 @cvt_i16_i64(i64 %x) {
 ; i32
 
 define i32 @cvt_i32_i16(i16 %x) {
-; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}]
+; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}]
 ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
 ; CHECK: ret
   %a = zext i16 %x to i32
@@ -38,7 +38,7 @@ define i32 @cvt_i32_i16(i16 %x) {
 }
 
 define i32 @cvt_i32_i64(i64 %x) {
-; CHECK: ld.param.u32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}]
+; CHECK: ld.param.b32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}]
 ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
 ; CHECK: ret
   %a = trunc i64 %x to i32
@@ -50,7 +50,7 @@ define i32 @cvt_i32_i64(i64 %x) {
 ; i64
 
 define i64 @cvt_i64_i16(i16 %x) {
-; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
+; CHECK: ld.param.b16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
 ; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]]
 ; CHECK: ret
   %a = zext i16 %x to i64
@@ -58,7 +58,7 @@ define i64 @cvt_i64_i16(i16 %x) {
 }
 
 define i64 @cvt_i64_i32(i32 %x) {
-; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
+; CHECK: ld.param.b32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
 ; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]]
 ; CHECK: ret
   %a = zext i32 %x to i64
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
index 7230872b3427c..d5fe45f8051fb 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm100.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll
@@ -14,7 +14,7 @@ define i32 @cvt_rn_satf_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_satf_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_satf_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rn.satfinite.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -29,7 +29,7 @@ define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rn.relu.satfinite.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -44,7 +44,7 @@ define i32 @cvt_rz_satf_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_satf_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_satf_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rz.satfinite.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -59,7 +59,7 @@ define i32 @cvt_rz_relu_satf_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rz.relu.satfinite.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
index 04d7a65f9e40e..def2575deb042 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll
@@ -14,8 +14,8 @@ define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.satfinite.e2m3x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -32,8 +32,8 @@ define i16 @cvt_rn_relu_sf_e2m3x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.satfinite.relu.e2m3x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -50,8 +50,8 @@ define i16 @cvt_rn_sf_e3m2x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.satfinite.e3m2x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -68,8 +68,8 @@ define i16 @cvt_rn_relu_sf_e3m2x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.satfinite.relu.e3m2x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -85,7 +85,7 @@ define <2 x half> @cvt_rn_f16x2_e2m3x2(i16 %in) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [cvt_rn_f16x2_e2m3x2_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [cvt_rn_f16x2_e2m3x2_param_0];
 ; CHECK-NEXT:    cvt.rn.f16x2.e2m3x2 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -100,7 +100,7 @@ define <2 x half> @cvt_rn_relu_f16x2_e2m3x2_relu(i16 %in) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0];
 ; CHECK-NEXT:    cvt.rn.relu.f16x2.e2m3x2 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -115,7 +115,7 @@ define <2 x half> @cvt_rn_f16x2_e3m2x2(i16 %in) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0];
 ; CHECK-NEXT:    cvt.rn.f16x2.e3m2x2 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -130,7 +130,7 @@ define <2 x half> @cvt_rn_relu_f16x2_e3m2x2(i16 %in) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0];
 ; CHECK-NEXT:    cvt.rn.relu.f16x2.e3m2x2 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -146,8 +146,8 @@ define i16 @cvt_rz_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rz_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_ue8m0x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rz.ue8m0x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -164,8 +164,8 @@ define i16 @cvt_rz_sf_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rz.satfinite.ue8m0x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -182,8 +182,8 @@ define i16 @cvt_rp_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rp_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rp_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rp_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rp_ue8m0x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rp.ue8m0x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -200,8 +200,8 @@ define i16 @cvt_rp_sf_ue8m0x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rp.satfinite.ue8m0x2.f32 %rs1, %f1, %f2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -281,7 +281,7 @@ define <2 x bfloat> @cvt_bf16x2_ue8m0x2(i16 %in) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [cvt_bf16x2_ue8m0x2_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [cvt_bf16x2_ue8m0x2_param_0];
 ; CHECK-NEXT:    cvt.rn.bf16x2.ue8m0x2 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
index eb7a6bdd222bb..0372d281ea355 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll
@@ -10,8 +10,8 @@ define <2 x bfloat> @cvt_rn_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_bf16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_bf16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -26,8 +26,8 @@ define <2 x bfloat> @cvt_rn_relu_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_relu_bf16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_bf16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.relu.bf16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -42,8 +42,8 @@ define <2 x bfloat> @cvt_rz_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rz_bf16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_bf16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rz.bf16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -58,8 +58,8 @@ define <2 x bfloat> @cvt_rz_relu_bf16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_relu_bf16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rz_relu_bf16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_bf16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_relu_bf16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rz.relu.bf16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -79,8 +79,8 @@ define <2 x half> @cvt_rn_f16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_f16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_f16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.f16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -95,8 +95,8 @@ define <2 x half> @cvt_rn_relu_f16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rn_relu_f16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rn_relu_f16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rn.relu.f16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -111,8 +111,8 @@ define <2 x half> @cvt_rz_f16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rz_f16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_f16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rz.f16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -127,8 +127,8 @@ define <2 x half> @cvt_rz_relu_f16x2_f32(float %f1, float %f2) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_relu_f16x2_f32_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [cvt_rz_relu_f16x2_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_f16x2_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [cvt_rz_relu_f16x2_f32_param_1];
 ; CHECK-NEXT:    cvt.rz.relu.f16x2.f32 %r1, %f1, %f2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -148,7 +148,7 @@ define bfloat @cvt_rn_bf16_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_bf16_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_bf16_f32_param_0];
 ; CHECK-NEXT:    cvt.rn.bf16.f32 %rs1, %f1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
@@ -163,7 +163,7 @@ define bfloat @cvt_rn_relu_bf16_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_bf16_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_bf16_f32_param_0];
 ; CHECK-NEXT:    cvt.rn.relu.bf16.f32 %rs1, %f1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
@@ -178,7 +178,7 @@ define bfloat @cvt_rz_bf16_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_bf16_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_bf16_f32_param_0];
 ; CHECK-NEXT:    cvt.rz.bf16.f32 %rs1, %f1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
@@ -193,7 +193,7 @@ define bfloat @cvt_rz_relu_bf16_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_relu_bf16_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_bf16_f32_param_0];
 ; CHECK-NEXT:    cvt.rz.relu.bf16.f32 %rs1, %f1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
@@ -213,7 +213,7 @@ define i32 @cvt_rna_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rna_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rna_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rna.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -231,8 +231,8 @@ define <2 x bfloat> @fold_ff2bf16x2(float %lo, float %hi) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fold_ff2bf16x2_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [fold_ff2bf16x2_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [fold_ff2bf16x2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [fold_ff2bf16x2_param_1];
 ; CHECK-NEXT:    cvt.rn.bf16x2.f32 %r1, %f2, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -250,8 +250,8 @@ define <2 x half> @fold_ff2f16x2(float %lo, float %hi) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fold_ff2f16x2_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [fold_ff2f16x2_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [fold_ff2f16x2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [fold_ff2f16x2_param_1];
 ; CHECK-NEXT:    cvt.rn.f16x2.f32 %r1, %f2, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll
index 340117f98cd94..dba8be1ef5a49 100644
--- a/llvm/test/CodeGen/NVPTX/convert-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll
@@ -14,7 +14,7 @@ define i32 @cvt_rn_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rn.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -29,7 +29,7 @@ define i32 @cvt_rn_relu_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rn_relu_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rn.relu.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -44,7 +44,7 @@ define i32 @cvt_rz_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rz.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -59,7 +59,7 @@ define i32 @cvt_rz_relu_tf32_f32(float %f1) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [cvt_rz_relu_tf32_f32_param_0];
 ; CHECK-NEXT:    cvt.rz.relu.tf32.f32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/copysign.ll b/llvm/test/CodeGen/NVPTX/copysign.ll
index 2e305e683d777..d8198182220e9 100644
--- a/llvm/test/CodeGen/NVPTX/copysign.ll
+++ b/llvm/test/CodeGen/NVPTX/copysign.ll
@@ -11,10 +11,10 @@ define float @fcopysign_f_f(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fcopysign_f_f_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [fcopysign_f_f_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [fcopysign_f_f_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [fcopysign_f_f_param_1];
 ; CHECK-NEXT:    copysign.f32 %f3, %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.copysign.f32(float %a, float %b)
   ret float %val
@@ -26,10 +26,10 @@ define double @fcopysign_d_d(double %a, double %b) {
 ; CHECK-NEXT:    .reg .b64 %fd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [fcopysign_d_d_param_0];
-; CHECK-NEXT:    ld.param.f64 %fd2, [fcopysign_d_d_param_1];
+; CHECK-NEXT:    ld.param.b64 %fd1, [fcopysign_d_d_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd2, [fcopysign_d_d_param_1];
 ; CHECK-NEXT:    copysign.f64 %fd3, %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
 ; CHECK-NEXT:    ret;
   %val = call double @llvm.copysign.f64(double %a, double %b)
   ret double %val
@@ -43,15 +43,15 @@ define float @fcopysign_f_d(float %a, double %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fcopysign_f_d_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [fcopysign_f_d_param_0];
 ; CHECK-NEXT:    abs.f32 %f2, %f1;
 ; CHECK-NEXT:    neg.f32 %f3, %f2;
-; CHECK-NEXT:    ld.param.u64 %rd1, [fcopysign_f_d_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [fcopysign_f_d_param_1];
 ; CHECK-NEXT:    shr.u64 %rd2, %rd1, 63;
 ; CHECK-NEXT:    and.b64 %rd3, %rd2, 1;
 ; CHECK-NEXT:    setp.ne.b64 %p1, %rd3, 0;
 ; CHECK-NEXT:    selp.f32 %f4, %f3, %f2, %p1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
 ; CHECK-NEXT:    ret;
   %c = fptrunc double %b to float
   %val = call float @llvm.copysign.f32(float %a, float %c)
@@ -66,15 +66,15 @@ define float @fcopysign_f_h(float %a, half %b) {
 ; CHECK-NEXT:    .reg .b32 %f<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fcopysign_f_h_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [fcopysign_f_h_param_0];
 ; CHECK-NEXT:    abs.f32 %f2, %f1;
 ; CHECK-NEXT:    neg.f32 %f3, %f2;
-; CHECK-NEXT:    ld.param.u16 %rs1, [fcopysign_f_h_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [fcopysign_f_h_param_1];
 ; CHECK-NEXT:    shr.u16 %rs2, %rs1, 15;
 ; CHECK-NEXT:    and.b16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs3, 0;
 ; CHECK-NEXT:    selp.f32 %f4, %f3, %f2, %p1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
 ; CHECK-NEXT:    ret;
   %c = fpext half %b to float
   %val = call float @llvm.copysign.f32(float %a, float %c)
@@ -89,15 +89,15 @@ define double @fcopysign_d_f(double %a, float %b) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [fcopysign_d_f_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [fcopysign_d_f_param_0];
 ; CHECK-NEXT:    abs.f64 %fd2, %fd1;
 ; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    ld.param.u32 %r1, [fcopysign_d_f_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [fcopysign_d_f_param_1];
 ; CHECK-NEXT:    shr.u32 %r2, %r1, 31;
 ; CHECK-NEXT:    and.b32 %r3, %r2, 1;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r3, 0;
 ; CHECK-NEXT:    selp.f64 %fd4, %fd3, %fd2, %p1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd4;
 ; CHECK-NEXT:    ret;
   %c = fpext float %b to double
   %val = call double @llvm.copysign.f64(double %a, double %c)
@@ -112,15 +112,15 @@ define double @fcopysign_d_h(double %a, half %b) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [fcopysign_d_h_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [fcopysign_d_h_param_0];
 ; CHECK-NEXT:    abs.f64 %fd2, %fd1;
 ; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    ld.param.u16 %rs1, [fcopysign_d_h_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [fcopysign_d_h_param_1];
 ; CHECK-NEXT:    shr.u16 %rs2, %rs1, 15;
 ; CHECK-NEXT:    and.b16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs3, 0;
 ; CHECK-NEXT:    selp.f64 %fd4, %fd3, %fd2, %p1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd4;
 ; CHECK-NEXT:    ret;
   %c = fpext half %b to double
   %val = call double @llvm.copysign.f64(double %a, double %c)
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
index 530a896642b89..b27f3078300b1 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
@@ -25,14 +25,14 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -44,14 +44,14 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -74,15 +74,15 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -94,15 +94,15 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -125,16 +125,16 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -146,16 +146,16 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -178,17 +178,17 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -200,17 +200,17 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -233,18 +233,18 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -256,18 +256,18 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -290,17 +290,17 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -312,17 +312,17 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -345,19 +345,19 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -369,19 +369,19 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -404,21 +404,21 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -430,21 +430,21 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
index 262df0777a201..c32c5559b1591 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
@@ -22,10 +22,10 @@ define void @cp_async_bulk_tensor_prefetch_tile_1d(ptr %tmap, i32 %d0, i64 %ch)
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.1d.L2.global.tile.L2::cache_hint [%rd1, {%r1}], %rd2;
 ; CHECK-PTX-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 undef, i1 0)
@@ -41,11 +41,11 @@ define void @cp_async_bulk_tensor_prefetch_tile_2d(i32 %flag, ptr %tmap, i32 %d0
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2}], %rd2;
 ; CHECK-PTX-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
@@ -62,14 +62,14 @@ define void @cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3}], %rd2;
-; CHECK-PTX-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.im2col [%rd1, {%r1, %r2, %r3}], {%rs1};
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1}, %rd2;
 ; CHECK-PTX-NEXT:    ret;
@@ -90,16 +90,16 @@ define void @cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.4d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], %rd2;
-; CHECK-PTX-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6];
-; CHECK-PTX-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7];
+; CHECK-PTX-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6];
+; CHECK-PTX-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.4d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2};
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.4d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2;
 ; CHECK-PTX-NEXT:    ret;
@@ -120,18 +120,18 @@ define void @cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_5d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5];
-; CHECK-PTX-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_5d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.5d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], %rd2;
-; CHECK-PTX-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7];
-; CHECK-PTX-NEXT:    ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8];
-; CHECK-PTX-NEXT:    ld.param.u16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9];
+; CHECK-PTX-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7];
+; CHECK-PTX-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8];
+; CHECK-PTX-NEXT:    ld.param.b16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.5d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3};
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.5d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3}, %rd2;
 ; CHECK-PTX-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll
index b0c39de17811a..b73631d219ba1 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll
@@ -22,10 +22,10 @@ define void @cp_async_bulk_tensor_reduce_tile_1d(ptr addrspace(3) %src, ptr %tma
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
@@ -71,11 +71,11 @@ define void @cp_async_bulk_tensor_reduce_tile_2d(ptr addrspace(3) %src, ptr %tma
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
@@ -121,12 +121,12 @@ define void @cp_async_bulk_tensor_reduce_tile_3d(ptr addrspace(3) %src, ptr %tma
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
@@ -172,13 +172,13 @@ define void @cp_async_bulk_tensor_reduce_tile_4d(ptr addrspace(3) %src, ptr %tma
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
@@ -224,14 +224,14 @@ define void @cp_async_bulk_tensor_reduce_tile_5d(ptr addrspace(3) %src, ptr %tma
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5];
-; CHECK-PTX-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
@@ -277,12 +277,12 @@ define void @cp_async_bulk_tensor_reduce_im2col_3d(ptr addrspace(3) %src, ptr %t
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
@@ -328,13 +328,13 @@ define void @cp_async_bulk_tensor_reduce_im2col_4d(ptr addrspace(3) %src, ptr %t
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
@@ -380,14 +380,14 @@ define void @cp_async_bulk_tensor_reduce_im2col_5d(ptr addrspace(3) %src, ptr %t
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0:
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1];
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2];
-; CHECK-PTX-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3];
-; CHECK-PTX-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4];
-; CHECK-PTX-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5];
-; CHECK-PTX-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2];
+; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3];
+; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4];
+; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5];
+; CHECK-PTX-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7];
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
 ; CHECK-PTX-NEXT:    cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
index de3d9ddaac9c3..6a366f658c777 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
@@ -24,11 +24,11 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap,
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    ret;
 ;
@@ -38,11 +38,11 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap,
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
@@ -58,12 +58,12 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src,
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    ret;
 ;
@@ -73,12 +73,12 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src,
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
@@ -94,13 +94,13 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
@@ -112,13 +112,13 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
@@ -139,14 +139,14 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
@@ -158,14 +158,14 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
@@ -186,15 +186,15 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
@@ -206,15 +206,15 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
index bf1b86e37ae72..77694ac82459a 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
@@ -19,14 +19,14 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_g2s_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_g2s_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_g2s_param_3];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_g2s_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [cp_async_bulk_g2s_param_5];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
-; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4;
 ; CHECK-PTX64-NEXT:    ret;
@@ -38,14 +38,14 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_g2s_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_g2s_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_g2s_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_g2s_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_g2s_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
@@ -63,11 +63,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_s2g_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_s2g_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_s2g_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_s2g_param_2];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_s2g_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3;
 ; CHECK-PTX64-NEXT:    ret;
 ;
@@ -77,11 +77,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_s2g_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_s2g_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_s2g_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_s2g_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_s2g_param_2];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_s2g_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0)
@@ -96,10 +96,10 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_cta_to_cluster_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_cta_to_cluster_param_1];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_cta_to_cluster_param_2];
-; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_cta_to_cluster_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_cta_to_cluster_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_cta_to_cluster_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_cta_to_cluster_param_3];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
 ; CHECK-PTX64-NEXT:    ret;
 ;
@@ -108,10 +108,10 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<5>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_cta_to_cluster_param_1];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_cta_to_cluster_param_2];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_cta_to_cluster_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_cta_to_cluster_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_cta_to_cluster_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_cta_to_cluster_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_cta_to_cluster_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%r3], [%r1], %r4, [%r2];
 ; CHECK-PTX-SHARED32-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
@@ -125,9 +125,9 @@ define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_prefetch_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [cp_async_bulk_prefetch_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_prefetch_param_2];
 ; CHECK-NEXT:    cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
 ; CHECK-NEXT:    cp.async.bulk.prefetch.L2.global [%rd1], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll
index 1443e5c46346c..1c3f69943803e 100644
--- a/llvm/test/CodeGen/NVPTX/ctlz.ll
+++ b/llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -17,7 +17,7 @@ define i32 @myctlz(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [myctlz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [myctlz_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -30,7 +30,7 @@ define i32 @myctlz_2(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [myctlz_2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [myctlz_2_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -47,7 +47,7 @@ define i64 @myctlz64(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [myctlz64_param_0];
 ; CHECK-NEXT:    clz.b64 %r1, %rd1;
 ; CHECK-NEXT:    cvt.u64.u32 %rd2, %r1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
@@ -62,7 +62,7 @@ define i64 @myctlz64_2(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz64_2_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [myctlz64_2_param_0];
 ; CHECK-NEXT:    clz.b64 %r1, %rd1;
 ; CHECK-NEXT:    cvt.u64.u32 %rd2, %r1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
@@ -81,7 +81,7 @@ define i32 @myctlz64_as_32(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz64_as_32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [myctlz64_as_32_param_0];
 ; CHECK-NEXT:    clz.b64 %r1, %rd1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -96,7 +96,7 @@ define i32 @myctlz64_as_32_2(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [myctlz64_as_32_2_param_0];
 ; CHECK-NEXT:    clz.b64 %r1, %rd1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -115,7 +115,7 @@ define i16 @myctlz_ret16(i16 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_ret16_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [myctlz_ret16_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
 ; CHECK-NEXT:    add.s32 %r3, %r2, -16;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -129,7 +129,7 @@ define i16 @myctlz_ret16_2(i16 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_ret16_2_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [myctlz_ret16_2_param_0];
 ; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-NEXT:    clz.b32 %r3, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -147,11 +147,11 @@ define void @myctlz_store16(i16 %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_store16_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [myctlz_store16_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
 ; CHECK-NEXT:    add.s32 %r3, %r2, -16;
-; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz_store16_param_1];
-; CHECK-NEXT:    st.u16 [%rd1], %r3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [myctlz_store16_param_1];
+; CHECK-NEXT:    st.b16 [%rd1], %r3;
 ; CHECK-NEXT:    ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
   store i16 %val, ptr %b
@@ -164,11 +164,11 @@ define void @myctlz_store16_2(i16 %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %r1, [myctlz_store16_2_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [myctlz_store16_2_param_0];
 ; CHECK-NEXT:    clz.b32 %r2, %r1;
 ; CHECK-NEXT:    add.s32 %r3, %r2, -16;
-; CHECK-NEXT:    ld.param.u64 %rd1, [myctlz_store16_2_param_1];
-; CHECK-NEXT:    st.u16 [%rd1], %r3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [myctlz_store16_2_param_1];
+; CHECK-NEXT:    st.b16 [%rd1], %r3;
 ; CHECK-NEXT:    ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
   store i16 %val, ptr %b
diff --git a/llvm/test/CodeGen/NVPTX/dag-cse.ll b/llvm/test/CodeGen/NVPTX/dag-cse.ll
index ff22c0bd747e4..84a38fd30963b 100644
--- a/llvm/test/CodeGen/NVPTX/dag-cse.ll
+++ b/llvm/test/CodeGen/NVPTX/dag-cse.ll
@@ -9,11 +9,11 @@
 ; Verify that loads with different memory types are not subject to CSE
 ; once they are promoted to the same type.
 ;
-; CHECK: ld.global.v2.u8  {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];
-; CHECK: st.global.v2.u8  [b], {%[[B1]], %[[B2]]};
+; CHECK: ld.global.v2.b8  {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];
+; CHECK: st.global.v2.b8  [b], {%[[B1]], %[[B2]]};
 ;
-; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a];
-; CHECK: st.global.u32 [c], %[[C]];
+; CHECK: ld.global.b32 %[[C:r[0-9]+]], [a];
+; CHECK: st.global.b32 [c], %[[C]];
 
 define void @test1() #0 {
   %1 = load <2 x i8>, ptr addrspace(1) @a, align 8
diff --git a/llvm/test/CodeGen/NVPTX/demote-vars.ll b/llvm/test/CodeGen/NVPTX/demote-vars.ll
index 16ae80dca1edc..ab89b62b53d05 100644
--- a/llvm/test/CodeGen/NVPTX/demote-vars.ll
+++ b/llvm/test/CodeGen/NVPTX/demote-vars.ll
@@ -66,9 +66,9 @@ define void @define_private_global(i64 %val) {
 ;
 ; Also check that the if-then is still here, otherwise we may not be testing
 ; the "more-than-one-use" part.
-; CHECK: st.shared.u64   [private_global_used_more_than_once_in_same_fct],
+; CHECK: st.shared.b64   [private_global_used_more_than_once_in_same_fct],
 ; CHECK: mov.b64 %[[VAR:.*]], 25
-; CHECK: st.shared.u64   [private_global_used_more_than_once_in_same_fct], %[[VAR]]
+; CHECK: st.shared.b64   [private_global_used_more_than_once_in_same_fct], %[[VAR]]
 define void @define_private_global_more_than_one_use(i64 %val, i1 %cond) {
   store i64 %val, ptr addrspace(3) @private_global_used_more_than_once_in_same_fct
   br i1 %cond, label %then, label %end
diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
index 8e5c9bab97c8d..ce72f5f52b8a8 100644
--- a/llvm/test/CodeGen/NVPTX/discard.ll
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -13,7 +13,7 @@ define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [discard_global_L2_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [discard_global_L2_param_0];
 ; CHECK-PTX64-NEXT:    discard.global.L2 [%rd1], 128;
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 128)
@@ -26,7 +26,7 @@ define void @discard_L2(ptr %ptr) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [discard_L2_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [discard_L2_param_0];
 ; CHECK-PTX64-NEXT:    discard.L2 [%rd1], 128;
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.discard.L2(ptr %ptr, i64 128)
diff --git a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
index b0e2082621bff..a21261c768862 100644
--- a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
+++ b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll
@@ -14,7 +14,7 @@ define i32  @test_disjoint_or_addr(i16 %a) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.b64 %rd1, a;
 ; CHECK-NEXT:    cvta.global.u64 %rd2, %rd1;
-; CHECK-NEXT:    ld.u32 %r1, [%rd2+8];
+; CHECK-NEXT:    ld.b32 %r1, [%rd2+8];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a1 = ptrtoint ptr @a to i64
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
index a233616563085..cea3ac37c1964 100644
--- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
+++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
@@ -18,17 +18,17 @@ define i32 @test_distributed_shared_cluster_common(ptr %ptr, ptr addrspace(3) %s
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_distributed_shared_cluster_common_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_distributed_shared_cluster_common_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_common_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_distributed_shared_cluster_common_param_1];
 ; CHECK-NEXT:    mov.u32 %r1, %ctaid.x;
 ; CHECK-NEXT:    xor.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    isspacep.shared::cluster %p1, %rd1;
 ; CHECK-NEXT:    mapa.u64 %rd3, %rd1, %r2;
 ; CHECK-NEXT:    isspacep.shared::cluster %p2, %rd3;
 ; CHECK-NEXT:    mapa.shared::cluster.u64 %rd4, %rd2, %r2;
-; CHECK-NEXT:    ld.shared::cluster.u32 %r3, [%rd4];
+; CHECK-NEXT:    ld.shared::cluster.b32 %r3, [%rd4];
 ; CHECK-NEXT:    add.s32 %r4, %r3, 42;
-; CHECK-NEXT:    st.shared::cluster.u32 [%rd4], %r4;
+; CHECK-NEXT:    st.shared::cluster.b32 [%rd4], %r4;
 ; CHECK-NEXT:    selp.b32 %r5, 1, 0, %p1;
 ; CHECK-NEXT:    selp.b32 %r6, 1, 0, %p2;
 ; CHECK-NEXT:    add.s32 %r7, %r5, %r6;
@@ -64,7 +64,7 @@ define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsme
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
 ; CHECK-NEXT:    mov.b16 %rs1, 0x3C00;
 ; CHECK-NEXT:    atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
 ; CHECK-NEXT:    mov.b16 %rs3, 0x3F80;
@@ -90,7 +90,7 @@ define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_
 ; CHECK-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
 ; CHECK-NEXT:    atom.shared::cluster.add.u32 %r1, [%rd1], 1;
 ; CHECK-NEXT:    atom.shared::cluster.add.u64 %rd2, [%rd1], 1;
 ; CHECK-NEXT:    atom.shared::cluster.exch.b32 %r2, [%rd1], 1;
@@ -142,7 +142,7 @@ define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %ds
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
 ; CHECK-NEXT:    atom.shared::cluster.and.b32 %r1, [%rd1], 1;
 ; CHECK-NEXT:    atom.shared::cluster.and.b64 %rd2, [%rd1], 1;
 ; CHECK-NEXT:    atom.shared::cluster.or.b32 %r2, [%rd1], 1;
@@ -171,7 +171,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:    .reg .b64 %rd<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
 ; CHECK-NEXT:    atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0;
 ; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0;
 ; CHECK-NEXT:    atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0;
@@ -205,7 +205,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:    not.b32 %r2, %r36;
 ; CHECK-NEXT:    mov.b32 %r37, 1;
 ; CHECK-NEXT:    shl.b32 %r3, %r37, %r1;
-; CHECK-NEXT:    ld.shared::cluster.u32 %r38, [%rd1];
+; CHECK-NEXT:    ld.shared::cluster.b32 %r38, [%rd1];
 ; CHECK-NEXT:    and.b32 %r48, %r38, %r2;
 ; CHECK-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop33
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -220,7 +220,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:    mov.b32 %r48, %r7;
 ; CHECK-NEXT:    @%p2 bra $L__BB4_1;
 ; CHECK-NEXT:  $L__BB4_3: // %partword.cmpxchg.end31
-; CHECK-NEXT:    ld.shared::cluster.u32 %r40, [%rd1];
+; CHECK-NEXT:    ld.shared::cluster.b32 %r40, [%rd1];
 ; CHECK-NEXT:    and.b32 %r49, %r40, %r2;
 ; CHECK-NEXT:  $L__BB4_4: // %partword.cmpxchg.loop23
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -237,7 +237,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:  $L__BB4_6: // %partword.cmpxchg.end21
 ; CHECK-NEXT:    fence.acq_rel.sys;
 ; CHECK-NEXT:    fence.acq_rel.sys;
-; CHECK-NEXT:    ld.shared::cluster.u32 %r42, [%rd1];
+; CHECK-NEXT:    ld.shared::cluster.b32 %r42, [%rd1];
 ; CHECK-NEXT:    and.b32 %r50, %r42, %r2;
 ; CHECK-NEXT:  $L__BB4_7: // %partword.cmpxchg.loop13
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -253,7 +253,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:    @%p6 bra $L__BB4_7;
 ; CHECK-NEXT:  $L__BB4_9: // %partword.cmpxchg.end11
 ; CHECK-NEXT:    fence.acq_rel.sys;
-; CHECK-NEXT:    ld.shared::cluster.u32 %r44, [%rd1];
+; CHECK-NEXT:    ld.shared::cluster.b32 %r44, [%rd1];
 ; CHECK-NEXT:    and.b32 %r51, %r44, %r2;
 ; CHECK-NEXT:  $L__BB4_10: // %partword.cmpxchg.loop3
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -270,7 +270,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
 ; CHECK-NEXT:  $L__BB4_12: // %partword.cmpxchg.end1
 ; CHECK-NEXT:    fence.acq_rel.sys;
 ; CHECK-NEXT:    fence.sc.sys;
-; CHECK-NEXT:    ld.shared::cluster.u32 %r46, [%rd1];
+; CHECK-NEXT:    ld.shared::cluster.b32 %r46, [%rd1];
 ; CHECK-NEXT:    and.b32 %r52, %r46, %r2;
 ; CHECK-NEXT:  $L__BB4_13: // %partword.cmpxchg.loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/NVPTX/div.ll b/llvm/test/CodeGen/NVPTX/div.ll
index f8711e3a83591..bd8d9a35eed46 100644
--- a/llvm/test/CodeGen/NVPTX/div.ll
+++ b/llvm/test/CodeGen/NVPTX/div.ll
@@ -8,15 +8,15 @@ define float @div_full(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [div_full_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [div_full_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [div_full_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [div_full_param_1];
 ; CHECK-NEXT:    div.full.f32 %f3, %f1, %f2;
 ; CHECK-NEXT:    mov.b32 %f4, 0f40400000;
 ; CHECK-NEXT:    div.full.f32 %f5, %f3, %f4;
 ; CHECK-NEXT:    div.full.ftz.f32 %f6, %f5, %f2;
 ; CHECK-NEXT:    mov.b32 %f7, 0f40800000;
 ; CHECK-NEXT:    div.full.ftz.f32 %f8, %f6, %f7;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f8;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f8;
 ; CHECK-NEXT:    ret;
   %1 = call float @llvm.nvvm.div.full(float %a, float %b)
   %2 = call float @llvm.nvvm.div.full(float %1, float 3.0)
diff --git a/llvm/test/CodeGen/NVPTX/dot-product.ll b/llvm/test/CodeGen/NVPTX/dot-product.ll
index 984b2bb0d27d0..6d634229b37a0 100644
--- a/llvm/test/CodeGen/NVPTX/dot-product.ll
+++ b/llvm/test/CodeGen/NVPTX/dot-product.ll
@@ -15,9 +15,9 @@ define i32 @test_dp4a_u32_u32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp4a_u32_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp4a_u32_u32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp4a_u32_u32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp4a_u32_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp4a_u32_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp4a_u32_u32_param_2];
 ; CHECK-NEXT:    dp4a.u32.u32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -31,7 +31,7 @@ define i32 @test_dp4a_u32imm_u32imm(i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp4a_u32imm_u32imm_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp4a_u32imm_u32imm_param_0];
 ; CHECK-NEXT:    mov.b32 %r2, 0;
 ; CHECK-NEXT:    dp4a.u32.u32 %r3, %r2, %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -46,9 +46,9 @@ define i32 @test_dp4a_u32_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp4a_u32_s32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp4a_u32_s32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp4a_u32_s32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp4a_u32_s32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp4a_u32_s32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp4a_u32_s32_param_2];
 ; CHECK-NEXT:    dp4a.u32.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -62,9 +62,9 @@ define i32 @test_dp4a_s32_u32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp4a_s32_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp4a_s32_u32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp4a_s32_u32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp4a_s32_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp4a_s32_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp4a_s32_u32_param_2];
 ; CHECK-NEXT:    dp4a.s32.u32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -78,9 +78,9 @@ define i32 @test_dp4a_s32_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp4a_s32_s32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp4a_s32_s32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp4a_s32_s32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp4a_s32_s32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp4a_s32_s32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp4a_s32_s32_param_2];
 ; CHECK-NEXT:    dp4a.s32.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -99,9 +99,9 @@ define i32 @test_dp2a_lo_u32_u32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_lo_u32_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_lo_u32_u32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_lo_u32_u32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_lo_u32_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_lo_u32_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_lo_u32_u32_param_2];
 ; CHECK-NEXT:    dp2a.lo.u32.u32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -115,9 +115,9 @@ define i32 @test_dp2a_lo_u32_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_lo_u32_s32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_lo_u32_s32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_lo_u32_s32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_lo_u32_s32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_lo_u32_s32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_lo_u32_s32_param_2];
 ; CHECK-NEXT:    dp2a.lo.u32.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -131,9 +131,9 @@ define i32 @test_dp2a_lo_s32_u32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_lo_s32_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_lo_s32_u32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_lo_s32_u32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_lo_s32_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_lo_s32_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_lo_s32_u32_param_2];
 ; CHECK-NEXT:    dp2a.lo.s32.u32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -147,9 +147,9 @@ define i32 @test_dp2a_lo_s32_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_lo_s32_s32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_lo_s32_s32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_lo_s32_s32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_lo_s32_s32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_lo_s32_s32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_lo_s32_s32_param_2];
 ; CHECK-NEXT:    dp2a.lo.s32.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -163,9 +163,9 @@ define i32 @test_dp2a_hi_u32_u32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_hi_u32_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_hi_u32_u32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_hi_u32_u32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_hi_u32_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_hi_u32_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_hi_u32_u32_param_2];
 ; CHECK-NEXT:    dp2a.hi.u32.u32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -179,9 +179,9 @@ define i32 @test_dp2a_hi_u32_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_hi_u32_s32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_hi_u32_s32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_hi_u32_s32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_hi_u32_s32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_hi_u32_s32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_hi_u32_s32_param_2];
 ; CHECK-NEXT:    dp2a.hi.u32.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -195,9 +195,9 @@ define i32 @test_dp2a_hi_s32_u32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_hi_s32_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_hi_s32_u32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_hi_s32_u32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_hi_s32_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_hi_s32_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_hi_s32_u32_param_2];
 ; CHECK-NEXT:    dp2a.hi.s32.u32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -211,9 +211,9 @@ define i32 @test_dp2a_hi_s32_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_dp2a_hi_s32_s32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_dp2a_hi_s32_s32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_dp2a_hi_s32_s32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_dp2a_hi_s32_s32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_dp2a_hi_s32_s32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_dp2a_hi_s32_s32_param_2];
 ; CHECK-NEXT:    dp2a.hi.s32.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
index 5d46c74157abd..f70831cc97ae1 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
@@ -9,17 +9,17 @@ define void @foo(i64 %a, ptr %p0, ptr %p1) {
 ; CHECK-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [foo_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [foo_param_0];
 ; CHECK-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-NEXT:    and.b64 %rd3, %rd2, -8;
 ; CHECK-NEXT:    alloca.u64 %rd4, %rd3, 16;
 ; CHECK-NEXT:    cvta.local.u64 %rd4, %rd4;
-; CHECK-NEXT:    ld.param.u64 %rd5, [foo_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd5, [foo_param_1];
 ; CHECK-NEXT:    alloca.u64 %rd6, %rd3, 16;
 ; CHECK-NEXT:    cvta.local.u64 %rd6, %rd6;
-; CHECK-NEXT:    ld.param.u64 %rd7, [foo_param_2];
-; CHECK-NEXT:    st.u64 [%rd5], %rd4;
-; CHECK-NEXT:    st.u64 [%rd7], %rd6;
+; CHECK-NEXT:    ld.param.b64 %rd7, [foo_param_2];
+; CHECK-NEXT:    st.b64 [%rd5], %rd4;
+; CHECK-NEXT:    st.b64 [%rd7], %rd6;
 ; CHECK-NEXT:    ret;
   %b = alloca i8, i64 %a, align 16
   %c = alloca i8, i64 %a, align 16
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index de19d2983f343..664569e3c525c 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -11,7 +11,7 @@
 ; CHECK-LABEL: .visible .func  (.param .b32 func_retval0) test_dynamic_stackalloc(
 ; CHECK-NOT: __local_depot
 
-; CHECK-32:       ld.param.u32  %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-32:       ld.param.b32  %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
 ; CHECK-32-NEXT:  add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7;
 ; CHECK-32-NEXT:  and.b32         %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
 ; CHECK-32-NEXT:  alloca.u32  %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
@@ -20,7 +20,7 @@
 ; CHECK-32-NEXT:  .param .b32 param0;
 ; CHECK-32-NEXT:  st.param.b32  [param0], %r[[ALLOCA]];
 
-; CHECK-64:       ld.param.u64  %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-64:       ld.param.b64  %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
 ; CHECK-64-NEXT:  add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
 ; CHECK-64-NEXT:  and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
 ; CHECK-64-NEXT:  alloca.u64  %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
diff --git a/llvm/test/CodeGen/NVPTX/elect.ll b/llvm/test/CodeGen/NVPTX/elect.ll
index 34b40ccdcbcbe..93c30a9b00068 100644
--- a/llvm/test/CodeGen/NVPTX/elect.ll
+++ b/llvm/test/CodeGen/NVPTX/elect.ll
@@ -14,7 +14,7 @@ define {i32, i1} @elect_sync(i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [elect_sync_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [elect_sync_param_0];
 ; CHECK-NEXT:    elect.sync %r2|%p1, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p1;
@@ -51,7 +51,7 @@ define {i32, i1} @elect_sync_twice(i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [elect_sync_twice_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [elect_sync_twice_param_0];
 ; CHECK-NEXT:    elect.sync %r2|%p1, %r1;
 ; CHECK-NEXT:    elect.sync %r3|%p2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
diff --git a/llvm/test/CodeGen/NVPTX/extloadv.ll b/llvm/test/CodeGen/NVPTX/extloadv.ll
index c9d14efa3a00d..3d861e69128bf 100644
--- a/llvm/test/CodeGen/NVPTX/extloadv.ll
+++ b/llvm/test/CodeGen/NVPTX/extloadv.ll
@@ -4,7 +4,7 @@
 define void @foo(ptr nocapture readonly %x_value, ptr nocapture %output) #0 {
   %1 = load <4 x float>, ptr %x_value, align 16
   %2 = fpext <4 x float> %1 to <4 x double>
-; CHECK-NOT: ld.v2.f32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}];
+; CHECK-NOT: ld.v2.b32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}];
 ; CHECK:  cvt.f64.f32
 ; CHECK:  cvt.f64.f32
 ; CHECK:  cvt.f64.f32
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index bf4a41ae46050..79d80e6f8fa84 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -11,7 +11,7 @@ define i16  @test_v2i8(i16 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_v2i8_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_v2i8_param_0];
 ; CHECK-NEXT:    cvt.s16.s8 %rs2, %rs1;
 ; CHECK-NEXT:    shr.s16 %rs3, %rs1, 8;
 ; CHECK-NEXT:    add.s16 %rs4, %rs2, %rs3;
@@ -36,8 +36,8 @@ define i1  @test_v2i8_load(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_v2i8_load_param_0];
-; CHECK-NEXT:    ld.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_v2i8_load_param_0];
+; CHECK-NEXT:    ld.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    or.b16 %rs5, %rs1, %rs2;
 ; CHECK-NEXT:    and.b16 %rs6, %rs5, 255;
 ; CHECK-NEXT:    setp.eq.s16 %p1, %rs6, 0;
@@ -59,7 +59,7 @@ define i16  @test_v4i8(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_v4i8_param_0];
 ; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs1, %r2;
 ; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
@@ -95,7 +95,7 @@ define i32  @test_v4i8_s32(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_s32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_v4i8_s32_param_0];
 ; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
 ; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
 ; CHECK-NEXT:    bfe.s32 %r4, %r1, 16, 8;
@@ -126,7 +126,7 @@ define i32  @test_v4i8_u32(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_v4i8_u32_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r3, %r1, 8, 8;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
@@ -161,7 +161,7 @@ define i16  @test_v8i8(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_v8i8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_v8i8_param_0];
 ; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
 ; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index bea9db03caf6e..23fab22057869 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -220,7 +220,7 @@ define half @test_frem(half %a, half %b) #0 {
 
 ; CHECK-LABEL: test_store(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_store_param_0];
-; CHECK-DAG:  ld.param.u64    %[[PTR:rd[0-9]+]], [test_store_param_1];
+; CHECK-DAG:  ld.param.b64    %[[PTR:rd[0-9]+]], [test_store_param_1];
 ; CHECK-NEXT: st.b16          [%[[PTR]]], [[A]];
 ; CHECK-NEXT: ret;
 define void @test_store(half %a, ptr %b) #0 {
@@ -229,7 +229,7 @@ define void @test_store(half %a, ptr %b) #0 {
 }
 
 ; CHECK-LABEL: test_load(
-; CHECK:      ld.param.u64    %[[PTR:rd[0-9]+]], [test_load_param_0];
+; CHECK:      ld.param.b64    %[[PTR:rd[0-9]+]], [test_load_param_0];
 ; CHECK-NEXT: ld.b16          [[R:%rs[0-9]+]], [%[[PTR]]];
 ; CHECK-NEXT: st.param.b16    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
@@ -239,12 +239,12 @@ define half @test_load(ptr %a) #0 {
 }
 
 ; CHECK-LABEL: .visible .func test_halfp0a1(
-; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
-; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
-; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
-; CHECK-DAG: st.u8        [%[[TO]]], [[B0]]
-; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
-; CHECK-DAG: st.u8        [%[[TO]]+1], [[B1]]
+; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
+; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
+; CHECK-DAG: ld.b8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.b8        [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.b8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.b8        [%[[TO]]+1], [[B1]]
 ; CHECK: ret
 define void @test_halfp0a1(ptr noalias readonly %from, ptr %to) {
   %1 = load half, ptr %from , align 1
@@ -357,8 +357,8 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
 }
 
 ; CHECK-LABEL: test_select_cc_f32_f16(
-; CHECK-DAG:  ld.param.f32    [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
-; CHECK-DAG:  ld.param.f32    [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
+; CHECK-DAG:  ld.param.b32    [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
 ; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_select_cc_f32_f16_param_2];
 ; CHECK-DAG:  ld.param.b16    [[D:%rs[0-9]+]], [test_select_cc_f32_f16_param_3];
 ; CHECK-F16-NOFTZ:  setp.neu.f16    [[PRED:%p[0-9]+]], [[C]], [[D]]
@@ -367,7 +367,7 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
 ; CHECK-NOF16: setp.neu.f32    [[PRED:%p[0-9]+]], [[CF]], [[DF]]
 ; CHECK-NEXT: selp.f32        [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
-; CHECK-NEXT: st.param.f32    [func_retval0], [[R]];
+; CHECK-NEXT: st.param.b32    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
   %cc = fcmp une half %c, %d
@@ -377,8 +377,8 @@ define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
 
 ; CHECK-LABEL: test_select_cc_f16_f32(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_select_cc_f16_f32_param_0];
-; CHECK-DAG:  ld.param.f32    [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
-; CHECK-DAG:  ld.param.f32    [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
+; CHECK-DAG:  ld.param.b32    [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
 ; CHECK-NOFTZ-DAG:  setp.neu.f32    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-F16-FTZ-DAG:  setp.neu.ftz.f32    [[PRED:%p[0-9]+]], [[C]], [[D]]
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_select_cc_f16_f32_param_1];
@@ -619,17 +619,17 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 {
 ; CHECK-LABEL: test_br_cc(
 ; CHECK-DAG:  ld.param.b16    [[A:%rs[0-9]+]], [test_br_cc_param_0];
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_br_cc_param_1];
-; CHECK-DAG:  ld.param.u64    %[[C:rd[0-9]+]], [test_br_cc_param_2];
-; CHECK-DAG:  ld.param.u64    %[[D:rd[0-9]+]], [test_br_cc_param_3];
+; CHECK-DAG:  ld.param.b64    %[[C:rd[0-9]+]], [test_br_cc_param_2];
+; CHECK-DAG:  ld.param.b64    %[[D:rd[0-9]+]], [test_br_cc_param_3];
 ; CHECK-F16-NOFTZ:  setp.lt.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-F16-FTZ:  setp.lt.ftz.f16     [[PRED:%p[0-9]+]], [[A]], [[B]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.lt.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
 ; CHECK-NEXT: @[[PRED]] bra   [[LABEL:\$L__BB.*]];
-; CHECK:      st.u32  [%[[C]]],
+; CHECK:      st.b32  [%[[C]]],
 ; CHECK:      [[LABEL]]:
-; CHECK:      st.u32  [%[[D]]],
+; CHECK:      st.b32  [%[[D]]],
 ; CHECK:      ret;
 define void @test_br_cc(half %a, half %b, ptr %p1, ptr %p2) #0 {
   %c = fcmp uge half %a, %b
@@ -643,7 +643,7 @@ else:
 }
 
 ; CHECK-LABEL: test_phi(
-; CHECK:      ld.param.u64    %[[P1:rd[0-9]+]], [test_phi_param_0];
+; CHECK:      ld.param.b64    %[[P1:rd[0-9]+]], [test_phi_param_0];
 ; CHECK:      ld.b16  {{%rs[0-9]+}}, [%[[P1]]];
 ; CHECK: [[LOOP:\$L__BB[0-9_]+]]:
 ; CHECK:      mov.b16 [[R:%rs[0-9]+]], [[AB:%rs[0-9]+]];
@@ -712,7 +712,7 @@ define i64 @test_fptoui_i64(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_uitofp_i32(
-; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
 ; CHECK:      cvt.rn.f16.u32  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -722,7 +722,7 @@ define half @test_uitofp_i32(i32 %a) #0 {
 }
 
 ; CHECK-LABEL: test_uitofp_i64(
-; CHECK:      ld.param.u64    [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
+; CHECK:      ld.param.b64    [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
 ; CHECK:      cvt.rn.f16.u64  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -732,7 +732,7 @@ define half @test_uitofp_i64(i64 %a) #0 {
 }
 
 ; CHECK-LABEL: test_sitofp_i32(
-; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
+; CHECK:      ld.param.b32    [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
 ; CHECK:      cvt.rn.f16.s32  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -742,7 +742,7 @@ define half @test_sitofp_i32(i32 %a) #0 {
 }
 
 ; CHECK-LABEL: test_sitofp_i64(
-; CHECK:      ld.param.u64    [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
+; CHECK:      ld.param.b64    [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
 ; CHECK:      cvt.rn.f16.s64  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -752,7 +752,7 @@ define half @test_sitofp_i64(i64 %a) #0 {
 }
 
 ; CHECK-LABEL: test_uitofp_i32_fadd(
-; CHECK-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
 ; CHECK-DAG:  cvt.rn.f16.u32  [[C:%rs[0-9]+]], [[A]];
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_uitofp_i32_fadd_param_1];
 ; CHECK-F16-NOFTZ:       add.rn.f16      [[R:%rs[0-9]+]], [[B]], [[C]];
@@ -770,7 +770,7 @@ define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_sitofp_i32_fadd(
-; CHECK-DAG:  ld.param.u32    [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
+; CHECK-DAG:  ld.param.b32    [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
 ; CHECK-DAG:  cvt.rn.f16.s32  [[C:%rs[0-9]+]], [[A]];
 ; CHECK-DAG:  ld.param.b16    [[B:%rs[0-9]+]], [test_sitofp_i32_fadd_param_1];
 ; CHECK-F16-NOFTZ:         add.rn.f16     [[R:%rs[0-9]+]], [[B]], [[C]];
@@ -788,7 +788,7 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_fptrunc_float(
-; CHECK:      ld.param.f32    [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
+; CHECK:      ld.param.b32    [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
 ; CHECK:      cvt.rn.f16.f32  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -798,7 +798,7 @@ define half @test_fptrunc_float(float %a) #0 {
 }
 
 ; CHECK-LABEL: test_fptrunc_double(
-; CHECK:      ld.param.f64    [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
+; CHECK:      ld.param.b64    [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
 ; CHECK:      cvt.rn.f16.f64  [[R:%rs[0-9]+]], [[A]];
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
@@ -811,7 +811,7 @@ define half @test_fptrunc_double(double %a) #0 {
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fpext_float_param_0];
 ; CHECK-NOFTZ:      cvt.f32.f16     [[R:%f[0-9]+]], [[A]];
 ; CHECK-F16-FTZ:      cvt.ftz.f32.f16     [[R:%f[0-9]+]], [[A]];
-; CHECK:      st.param.f32    [func_retval0], [[R]];
+; CHECK:      st.param.b32    [func_retval0], [[R]];
 ; CHECK:      ret;
 define float @test_fpext_float(half %a) #0 {
   %r = fpext half %a to float
@@ -821,7 +821,7 @@ define float @test_fpext_float(half %a) #0 {
 ; CHECK-LABEL: test_fpext_double(
 ; CHECK:      ld.param.b16    [[A:%rs[0-9]+]], [test_fpext_double_param_0];
 ; CHECK:      cvt.f64.f16     [[R:%fd[0-9]+]], [[A]];
-; CHECK:      st.param.f64    [func_retval0], [[R]];
+; CHECK:      st.param.b64    [func_retval0], [[R]];
 ; CHECK:      ret;
 define double @test_fpext_double(half %a) #0 {
   %r = fpext half %a to double
@@ -840,7 +840,7 @@ define i16 @test_bitcast_halftoi16(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_bitcast_i16tohalf(
-; CHECK:      ld.param.u16    [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
+; CHECK:      ld.param.b16    [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
 ; CHECK:      st.param.b16    [func_retval0], [[AS]];
 ; CHECK:      ret;
 define half @test_bitcast_i16tohalf(i16 %a) #0 {
@@ -1043,7 +1043,7 @@ define half @test_copysign(half %a, half %b) #0 {
 
 ; CHECK-LABEL: test_copysign_f32(
 ; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG:  ld.param.f32    [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
+; CHECK-DAG:  ld.param.b32    [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
 ; CHECK-DAG:  mov.b32         [[B:%r[0-9]+]], [[BF]];
 ; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG:  and.b32         [[BX0:%r[0-9]+]], [[B]], -2147483648;
@@ -1059,7 +1059,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 
 ; CHECK-LABEL: test_copysign_f64(
 ; CHECK-DAG:  ld.param.b16    [[AH:%rs[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG:  ld.param.f64    [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
+; CHECK-DAG:  ld.param.b64    [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
 ; CHECK-DAG:  mov.b64         [[B:%rd[0-9]+]], [[BD]];
 ; CHECK-DAG:  and.b16         [[AX:%rs[0-9]+]], [[AH]], 32767;
 ; CHECK-DAG:  and.b64         [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
@@ -1082,7 +1082,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 ; CHECK:      or.b16          [[RX:%rs[0-9]+]], [[AX]], [[BX]];
 ; CHECK-NOFTZ: cvt.f32.f16     [[XR:%f[0-9]+]], [[RX]];
 ; CHECK-F16-FTZ:   cvt.ftz.f32.f16 [[XR:%f[0-9]+]], [[RX]];
-; CHECK:      st.param.f32    [func_retval0], [[XR]];
+; CHECK:      st.param.b32    [func_retval0], [[XR]];
 ; CHECK:      ret;
 define float @test_copysign_extended(half %a, half %b) #0 {
   %r = call half @llvm.copysign.f16(half %a, half %b)
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index e854e5a6e5aaa..7fef947a0e599 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -82,7 +82,7 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_extract_i_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_i_param_0];
 ; CHECK-NEXT:    setp.eq.s64 %p1, %rd1, 0;
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
@@ -390,8 +390,8 @@ define void @test_ldst_v2f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2f16_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2f16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v2f16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v2f16_param_0];
 ; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.b32 [%rd2], %r1;
 ; CHECK-NEXT:    ret;
@@ -412,11 +412,11 @@ define void @test_ldst_v3f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v3f16_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3f16_param_0];
-; CHECK-NEXT:    ld.u64 %rd3, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v3f16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v3f16_param_0];
+; CHECK-NEXT:    ld.b64 %rd3, [%rd1];
 ; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd3; }
-; CHECK-NEXT:    st.u32 [%rd2], %rd3;
+; CHECK-NEXT:    st.b32 [%rd2], %rd3;
 ; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
 ; CHECK-NEXT:    st.b16 [%rd2+4], %rs1;
 ; CHECK-NEXT:    ret;
@@ -432,8 +432,8 @@ define void @test_ldst_v4f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v4f16_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v4f16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v4f16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v4f16_param_0];
 ; CHECK-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
 ; CHECK-NEXT:    ret;
@@ -449,8 +449,8 @@ define void @test_ldst_v8f16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v8f16_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v8f16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v8f16_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v8f16_param_0];
 ; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; CHECK-NEXT:    ret;
@@ -553,7 +553,7 @@ define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_select_param_2];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_select_param_1];
@@ -626,14 +626,14 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 ; CHECK-F16-NEXT:    .reg .b32 %f<7>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
-; CHECK-F16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
 ; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
 ; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
 ; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r1, %r2;
 ; CHECK-F16-NEXT:    selp.f32 %f5, %f2, %f4, %p2;
 ; CHECK-F16-NEXT:    selp.f32 %f6, %f1, %f3, %p1;
-; CHECK-F16-NEXT:    st.param.v2.f32 [func_retval0], {%f6, %f5};
+; CHECK-F16-NEXT:    st.param.v2.b32 [func_retval0], {%f6, %f5};
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-NOF16-LABEL: test_select_cc_f32_f16(
@@ -644,8 +644,8 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<11>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
-; CHECK-NOF16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
@@ -658,7 +658,7 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 ; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %f8, %f7;
 ; CHECK-NOF16-NEXT:    selp.f32 %f9, %f2, %f4, %p2;
 ; CHECK-NOF16-NEXT:    selp.f32 %f10, %f1, %f3, %p1;
-; CHECK-NOF16-NEXT:    st.param.v2.f32 [func_retval0], {%f10, %f9};
+; CHECK-NOF16-NEXT:    st.param.v2.b32 [func_retval0], {%f10, %f9};
 ; CHECK-NOF16-NEXT:    ret;
                                            <2 x half> %c, <2 x half> %d) #0 {
   %cc = fcmp une <2 x half> %c, %d
@@ -675,8 +675,8 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
 ; CHECK-NEXT:    .reg .b32 %f<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f16_f32_param_3];
-; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f16_f32_param_2];
 ; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
 ; CHECK-NEXT:    setp.neu.f32 %p1, %f1, %f3;
@@ -1388,7 +1388,7 @@ define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
 ; CHECK-NEXT:    cvt.rn.f16.u32 %rs1, %r2;
 ; CHECK-NEXT:    cvt.rn.f16.u32 %rs2, %r1;
 ; CHECK-NEXT:    mov.b32 %r3, {%rs2, %rs1};
@@ -1406,7 +1406,7 @@ define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
 ; CHECK-NEXT:    cvt.rn.f16.u64 %rs1, %rd2;
 ; CHECK-NEXT:    cvt.rn.f16.u64 %rs2, %rd1;
 ; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
@@ -1423,7 +1423,7 @@ define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
 ; CHECK-NEXT:    cvt.rn.f16.s32 %rs1, %r2;
 ; CHECK-NEXT:    cvt.rn.f16.s32 %rs2, %r1;
 ; CHECK-NEXT:    mov.b32 %r3, {%rs2, %rs1};
@@ -1441,7 +1441,7 @@ define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
 ; CHECK-NEXT:    cvt.rn.f16.s64 %rs1, %rd2;
 ; CHECK-NEXT:    cvt.rn.f16.s64 %rs2, %rd1;
 ; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
@@ -1459,7 +1459,7 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-F16-NEXT:    .reg .b32 %r<6>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
 ; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
 ; CHECK-F16-NEXT:    cvt.rn.f16.u32 %rs1, %r2;
 ; CHECK-F16-NEXT:    cvt.rn.f16.u32 %rs2, %r1;
@@ -1475,7 +1475,7 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs1, %r1;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs2, %r2;
@@ -1503,7 +1503,7 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-F16-NEXT:    .reg .b32 %r<6>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
 ; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
 ; CHECK-F16-NEXT:    cvt.rn.f16.s32 %rs1, %r2;
 ; CHECK-F16-NEXT:    cvt.rn.f16.s32 %rs2, %r1;
@@ -1519,7 +1519,7 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs1, %r1;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs2, %r2;
@@ -1548,7 +1548,7 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs1, %f2;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs2, %f1;
 ; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
@@ -1566,7 +1566,7 @@ define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
 ; CHECK-NEXT:    cvt.rn.f16.f64 %rs1, %fd2;
 ; CHECK-NEXT:    cvt.rn.f16.f64 %rs2, %fd1;
 ; CHECK-NEXT:    mov.b32 %r1, {%rs2, %rs1};
@@ -1588,7 +1588,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
 ; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
 ; CHECK-NEXT:    ret;
   %r = fpext <2 x half> %a to <2 x float>
   ret <2 x float> %r
@@ -1606,7 +1606,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f64.f16 %fd1, %rs2;
 ; CHECK-NEXT:    cvt.f64.f16 %fd2, %rs1;
-; CHECK-NEXT:    st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%fd2, %fd1};
 ; CHECK-NEXT:    ret;
   %r = fpext <2 x half> %a to <2 x double>
   ret <2 x double> %r
@@ -1619,7 +1619,7 @@ define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xhalf_to_2xi16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xhalf_to_2xi16_param_0];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %r = bitcast <2 x half> %a to <2 x i16>
@@ -1632,7 +1632,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xi16_to_2xhalf_param_0];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %r = bitcast <2 x i16> %a to <2 x half>
@@ -1646,7 +1646,7 @@ define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [test_bitcast_float_to_2xhalf_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [test_bitcast_float_to_2xhalf_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -1661,9 +1661,9 @@ define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xhalf_to_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xhalf_to_float_param_0];
 ; CHECK-NEXT:    mov.b32 %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
 ; CHECK-NEXT:    ret;
   %r = bitcast <2 x half> %a to float
   ret float %r
@@ -1987,7 +1987,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
 ; CHECK-F16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-F16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1];
 ; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
 ; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs1, %f2;
 ; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs2, %f1;
@@ -2005,7 +2005,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs2, 32767;
@@ -2034,7 +2034,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
 ; CHECK-F16-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1];
+; CHECK-F16-NEXT:    ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1];
 ; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_f64_param_0];
 ; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs1, %fd2;
 ; CHECK-F16-NEXT:    cvt.rn.f16.f64 %rs2, %fd1;
@@ -2053,7 +2053,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1];
+; CHECK-NOF16-NEXT:    ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f64_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs2, 32767;
@@ -2092,7 +2092,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-F16-NEXT:    mov.b32 {%rs1, %rs2}, %r5;
 ; CHECK-F16-NEXT:    cvt.f32.f16 %f1, %rs2;
 ; CHECK-F16-NEXT:    cvt.f32.f16 %f2, %rs1;
-; CHECK-F16-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-F16-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-NOF16-LABEL: test_copysign_extended(
@@ -2114,7 +2114,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    or.b16 %rs10, %rs9, %rs8;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %f1, %rs10;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %f2, %rs7;
-; CHECK-NOF16-NEXT:    st.param.v2.f32 [func_retval0], {%f2, %f1};
+; CHECK-NOF16-NEXT:    st.param.v2.b32 [func_retval0], {%f2, %f1};
 ; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
   %xr = fpext <2 x half> %r to <2 x float>
@@ -2359,7 +2359,7 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rn.f16.s16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rn.f16.s16 %rs4, %rs1;
@@ -2377,7 +2377,7 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rn.f16.u16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rn.f16.u16 %rs4, %rs1;
diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
index 2c5c8146fbf61..939782eccff55 100644
--- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll
@@ -12,9 +12,9 @@ define float @ex2_float(float %0) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [ex2_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [ex2_float_param_0];
 ; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.ex2.approx.f(float %0)
   ret float %res
@@ -27,9 +27,9 @@ define float @ex2_float_ftz(float %0) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [ex2_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [ex2_float_ftz_param_0];
 ; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
   ret float %res
diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
index 9dac3083d6cb8..2b101bc3af43a 100644
--- a/llvm/test/CodeGen/NVPTX/f32-lg2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll
@@ -13,9 +13,9 @@ define float @lg2_float(float %0) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [lg2_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [lg2_float_param_0];
 ; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.lg2.approx.f(float %0)
   ret float %res
@@ -28,9 +28,9 @@ define float @lg2_float_ftz(float %0) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [lg2_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [lg2_float_ftz_param_0];
 ; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.nvvm.lg2.approx.ftz.f(float %0)
   ret float %res
diff --git a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll
index d9c5a527b901c..51434f7566c14 100644
--- a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll
@@ -21,9 +21,9 @@ define float @fabs_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fabs_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [fabs_float_param_0];
 ; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.fabs.f32(float %a)
   ret float %ret
@@ -35,9 +35,9 @@ define float @fabs_float_ftz(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fabs_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [fabs_float_ftz_param_0];
 ; CHECK-NEXT:    abs.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %ret = call float @llvm.nvvm.fabs.ftz.f32(float %a)
   ret float %ret
@@ -49,9 +49,9 @@ define double @fabs_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [fabs_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [fabs_double_param_0];
 ; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %ret = call double @llvm.nvvm.fabs.f64(double %a)
   ret double %ret
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
index 4664d700209fa..c8940d9ae2a90 100644
--- a/llvm/test/CodeGen/NVPTX/fexp2.ll
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -16,9 +16,9 @@ define float @exp2_test(float %in) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [exp2_test_param_0];
 ; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_test(
@@ -26,9 +26,9 @@ define float @exp2_test(float %in) {
 ; CHECK-FP16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
-; CHECK-FP16-NEXT:    ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-FP16-NEXT:    ld.param.b32 %f1, [exp2_test_param_0];
 ; CHECK-FP16-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-FP16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-FP16-NEXT:    ret;
 ;
 ; CHECK-BF16-LABEL: exp2_test(
@@ -36,9 +36,9 @@ define float @exp2_test(float %in) {
 ; CHECK-BF16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-BF16-EMPTY:
 ; CHECK-BF16-NEXT:  // %bb.0: // %entry
-; CHECK-BF16-NEXT:    ld.param.f32 %f1, [exp2_test_param_0];
+; CHECK-BF16-NEXT:    ld.param.b32 %f1, [exp2_test_param_0];
 ; CHECK-BF16-NEXT:    ex2.approx.f32 %f2, %f1;
-; CHECK-BF16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-BF16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-BF16-NEXT:    ret;
 entry:
   %exp2 = call float @llvm.exp2.f32(float %in)
@@ -52,9 +52,9 @@ define float @exp2_ftz_test(float %in) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [exp2_ftz_test_param_0];
 ; CHECK-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_ftz_test(
@@ -62,9 +62,9 @@ define float @exp2_ftz_test(float %in) #0 {
 ; CHECK-FP16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
-; CHECK-FP16-NEXT:    ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-FP16-NEXT:    ld.param.b32 %f1, [exp2_ftz_test_param_0];
 ; CHECK-FP16-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-FP16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-FP16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-FP16-NEXT:    ret;
 ;
 ; CHECK-BF16-LABEL: exp2_ftz_test(
@@ -72,9 +72,9 @@ define float @exp2_ftz_test(float %in) #0 {
 ; CHECK-BF16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-BF16-EMPTY:
 ; CHECK-BF16-NEXT:  // %bb.0: // %entry
-; CHECK-BF16-NEXT:    ld.param.f32 %f1, [exp2_ftz_test_param_0];
+; CHECK-BF16-NEXT:    ld.param.b32 %f1, [exp2_ftz_test_param_0];
 ; CHECK-BF16-NEXT:    ex2.approx.ftz.f32 %f2, %f1;
-; CHECK-BF16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-BF16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-BF16-NEXT:    ret;
 entry:
   %exp2 = call float @llvm.exp2.f32(float %in)
@@ -88,10 +88,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
 ; CHECK-NEXT:    ex2.approx.f32 %f3, %f2;
 ; CHECK-NEXT:    ex2.approx.f32 %f4, %f1;
-; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
 ; CHECK-NEXT:    ret;
 ;
 ; CHECK-FP16-LABEL: exp2_test_v(
@@ -99,10 +99,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) {
 ; CHECK-FP16-NEXT:    .reg .b32 %f<5>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
-; CHECK-FP16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
+; CHECK-FP16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
 ; CHECK-FP16-NEXT:    ex2.approx.f32 %f3, %f2;
 ; CHECK-FP16-NEXT:    ex2.approx.f32 %f4, %f1;
-; CHECK-FP16-NEXT:    st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-FP16-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
 ; CHECK-FP16-NEXT:    ret;
 ;
 ; CHECK-BF16-LABEL: exp2_test_v(
@@ -110,10 +110,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) {
 ; CHECK-BF16-NEXT:    .reg .b32 %f<5>;
 ; CHECK-BF16-EMPTY:
 ; CHECK-BF16-NEXT:  // %bb.0: // %entry
-; CHECK-BF16-NEXT:    ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
+; CHECK-BF16-NEXT:    ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
 ; CHECK-BF16-NEXT:    ex2.approx.f32 %f3, %f2;
 ; CHECK-BF16-NEXT:    ex2.approx.f32 %f4, %f1;
-; CHECK-BF16-NEXT:    st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-BF16-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
 ; CHECK-BF16-NEXT:    ret;
 entry:
   %exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
@@ -259,7 +259,7 @@ define bfloat @exp2_bf16_test(bfloat %in) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u16 %r1, [exp2_bf16_test_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [exp2_bf16_test_param_0];
 ; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-NEXT:    mov.b32 %f1, %r2;
 ; CHECK-NEXT:    ex2.approx.f32 %f2, %f1;
@@ -282,7 +282,7 @@ define bfloat @exp2_bf16_test(bfloat %in) {
 ; CHECK-FP16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-FP16-EMPTY:
 ; CHECK-FP16-NEXT:  // %bb.0: // %entry
-; CHECK-FP16-NEXT:    ld.param.u16 %r1, [exp2_bf16_test_param_0];
+; CHECK-FP16-NEXT:    ld.param.b16 %r1, [exp2_bf16_test_param_0];
 ; CHECK-FP16-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-FP16-NEXT:    mov.b32 %f1, %r2;
 ; CHECK-FP16-NEXT:    ex2.approx.f32 %f2, %f1;
diff --git a/llvm/test/CodeGen/NVPTX/flo.ll b/llvm/test/CodeGen/NVPTX/flo.ll
index bc7f765e40ab4..fc4e30439caf2 100644
--- a/llvm/test/CodeGen/NVPTX/flo.ll
+++ b/llvm/test/CodeGen/NVPTX/flo.ll
@@ -10,7 +10,7 @@ define i32 @flo_1(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [flo_1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [flo_1_param_0];
 ; CHECK-NEXT:    bfind.s32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -25,7 +25,7 @@ define i32 @flo_2(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [flo_2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [flo_2_param_0];
 ; CHECK-NEXT:    bfind.shiftamt.s32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -39,7 +39,7 @@ define i32 @flo_3(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [flo_3_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [flo_3_param_0];
 ; CHECK-NEXT:    bfind.u32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -54,7 +54,7 @@ define i32 @flo_4(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [flo_4_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [flo_4_param_0];
 ; CHECK-NEXT:    bfind.shiftamt.u32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -71,7 +71,7 @@ define i32 @flo_5(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [flo_5_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [flo_5_param_0];
 ; CHECK-NEXT:    bfind.s64 %r1, %rd1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -87,7 +87,7 @@ define i32 @flo_6(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [flo_6_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [flo_6_param_0];
 ; CHECK-NEXT:    bfind.shiftamt.s64 %r1, %rd1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -102,7 +102,7 @@ define i32 @flo_7(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [flo_7_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [flo_7_param_0];
 ; CHECK-NEXT:    bfind.u64 %r1, %rd1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -118,7 +118,7 @@ define i32 @flo_8(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [flo_8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [flo_8_param_0];
 ; CHECK-NEXT:    bfind.shiftamt.u64 %r1, %rd1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
index 4dfed3dd944ae..d922e18edc165 100644
--- a/llvm/test/CodeGen/NVPTX/flog2.ll
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -10,9 +10,9 @@ define float @log2_test(float %in) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.f32 %f1, [log2_test_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [log2_test_param_0];
 ; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call float @llvm.log2.f32(float %in)
@@ -26,9 +26,9 @@ define float @log2_ftz_test(float %in) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.f32 %f1, [log2_ftz_test_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [log2_ftz_test_param_0];
 ; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call float @llvm.log2.f32(float %in)
@@ -42,10 +42,10 @@ define <2 x float> @log2_test_v(<2 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.v2.f32 {%f1, %f2}, [log2_test_v_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%f1, %f2}, [log2_test_v_param_0];
 ; CHECK-NEXT:    lg2.approx.f32 %f3, %f2;
 ; CHECK-NEXT:    lg2.approx.f32 %f4, %f1;
-; CHECK-NEXT:    st.param.v2.f32 [func_retval0], {%f4, %f3};
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%f4, %f3};
 ; CHECK-NEXT:    ret;
 entry:
   %log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
@@ -129,7 +129,7 @@ define bfloat @log2_bf16_test(bfloat %in) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u16 %r1, [log2_bf16_test_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [log2_bf16_test_param_0];
 ; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-NEXT:    mov.b32 %f1, %r2;
 ; CHECK-NEXT:    lg2.approx.f32 %f2, %f1;
@@ -158,7 +158,7 @@ define bfloat @log2_bf16_ftz_test(bfloat %in) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u16 %r1, [log2_bf16_ftz_test_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [log2_bf16_ftz_test_param_0];
 ; CHECK-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-NEXT:    mov.b32 %f1, %r2;
 ; CHECK-NEXT:    lg2.approx.ftz.f32 %f2, %f1;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index 9051a0bce14cd..b971d2f237b40 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -253,13 +253,13 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %
 ; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -317,13 +317,13 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -405,13 +405,13 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-SM70-NEXT:    .reg .b32 %f<10>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -493,13 +493,13 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
 ; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
index 73f808f1e06ee..d1081de000dba 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
@@ -187,13 +187,13 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -267,13 +267,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-SM70-NEXT:    .reg .b32 %f<9>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -348,13 +348,13 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
 ; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_maxnum_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
index b94fa5a24b502..05f7840dc3aa0 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
@@ -198,13 +198,13 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c)  {
 ; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -286,13 +286,13 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
 ; CHECK-SM70-NEXT:    .reg .b32 %f<10>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -376,13 +376,13 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
 ; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -1134,13 +1134,13 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c)  {
 ; CHECK-SM70-NEXT:    .reg .b32 %f<6>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -1214,13 +1214,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
 ; CHECK-SM70-NEXT:    .reg .b32 %f<9>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
@@ -1295,13 +1295,13 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)  {
 ; CHECK-SM70-NEXT:    .reg .b32 %f<7>;
 ; CHECK-SM70-EMPTY:
 ; CHECK-SM70-NEXT:  // %bb.0:
-; CHECK-SM70-NEXT:    ld.param.u16 %r1, [fma_bf16_maxnum_no_nans_param_2];
+; CHECK-SM70-NEXT:    ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
 ; CHECK-SM70-NEXT:    shl.b32 %r2, %r1, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f1, %r2;
-; CHECK-SM70-NEXT:    ld.param.u16 %r3, [fma_bf16_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT:    ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
 ; CHECK-SM70-NEXT:    shl.b32 %r4, %r3, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f2, %r4;
-; CHECK-SM70-NEXT:    ld.param.u16 %r5, [fma_bf16_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT:    ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
 ; CHECK-SM70-NEXT:    shl.b32 %r6, %r5, 16;
 ; CHECK-SM70-NEXT:    mov.b32 %f3, %r6;
 ; CHECK-SM70-NEXT:    fma.rn.f32 %f4, %f3, %f2, %f1;
diff --git a/llvm/test/CodeGen/NVPTX/fns.ll b/llvm/test/CodeGen/NVPTX/fns.ll
index c51f9aee551c7..b153e298bbff7 100644
--- a/llvm/test/CodeGen/NVPTX/fns.ll
+++ b/llvm/test/CodeGen/NVPTX/fns.ll
@@ -5,9 +5,9 @@ declare i32 @llvm.nvvm.fns(i32, i32, i32)
 
 ; CHECK-LABEL: .func{{.*}}fns
 define i32 @fns(i32 %mask, i32 %base, i32 %offset) {
-  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [fns_param_0];
-  ; CHECK: ld.param.u32 	[[BASE:%r[0-9]+]], [fns_param_1];
-  ; CHECK: ld.param.u32 	[[OFFSET:%r[0-9]+]], [fns_param_2];
+  ; CHECK: ld.param.b32 	[[MASK:%r[0-9]+]], [fns_param_0];
+  ; CHECK: ld.param.b32 	[[BASE:%r[0-9]+]], [fns_param_1];
+  ; CHECK: ld.param.b32 	[[OFFSET:%r[0-9]+]], [fns_param_2];
 
   ; CHECK:  fns.b32 	{{%r[0-9]+}}, [[MASK]], [[BASE]], [[OFFSET]];
   %r0 = call i32 @llvm.nvvm.fns(i32 %mask, i32 %base, i32 %offset);
diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
index 80ae8aac39115..d253df5ed1b9c 100644
--- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
+++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
@@ -10,7 +10,7 @@ define i32 @test_ld_param_const(ptr byval(i32) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_ld_param_const_param_0+4];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_ld_param_const_param_0+4];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %p2 = getelementptr i32, ptr %a, i32 1
@@ -28,7 +28,7 @@ define i32 @test_ld_param_non_const(ptr byval([10 x i32]) %a, i32 %b) {
 ; CHECK-NEXT:    mov.b64 %rd1, test_ld_param_non_const_param_0;
 ; CHECK-NEXT:    ld.param.s32 %rd2, [test_ld_param_non_const_param_1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd1, %rd2;
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd3];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd3];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %p2 = getelementptr i8, ptr %a, i32 %b
@@ -68,7 +68,7 @@ define void @test_ld_param_byval(ptr byval(i32) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_ld_param_byval_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_ld_param_byval_param_0];
 ; CHECK-NEXT:    { // callseq 1, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.b32 [param0], %r1;
@@ -91,9 +91,9 @@ define i32 @test_modify_param(ptr byval([10 x i32]) %a, i32 %b, i32 %c ) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    mov.b64 %rd1, test_modify_param_param_0;
-; CHECK-NEXT:    ld.param.u32 %r1, [test_modify_param_param_1];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_modify_param_param_2];
-; CHECK-NEXT:    st.local.u32 [%rd1+2], %r1;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_modify_param_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_modify_param_param_2];
+; CHECK-NEXT:    st.local.b32 [%rd1+2], %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %p2 = getelementptr i8, ptr %a, i32 2
@@ -110,16 +110,16 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_multi_block_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_multi_block_param_1];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; CHECK-NEXT:    not.pred %p2, %p1;
 ; CHECK-NEXT:    @%p2 bra $L__BB5_2;
 ; CHECK-NEXT:  // %bb.1: // %if
-; CHECK-NEXT:    ld.param.u32 %r4, [test_multi_block_param_0+4];
+; CHECK-NEXT:    ld.param.b32 %r4, [test_multi_block_param_0+4];
 ; CHECK-NEXT:    bra.uni $L__BB5_3;
 ; CHECK-NEXT:  $L__BB5_2: // %else
-; CHECK-NEXT:    ld.param.u32 %r4, [test_multi_block_param_0+8];
+; CHECK-NEXT:    ld.param.b32 %r4, [test_multi_block_param_0+8];
 ; CHECK-NEXT:  $L__BB5_3: // %end
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll
index bd559ea157feb..89a402db8e42a 100644
--- a/llvm/test/CodeGen/NVPTX/fp-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-contract.ll
@@ -18,11 +18,11 @@ define float @t0(float %a, float %b, float %c) {
 ; FAST-NEXT:    .reg .b32 %f<5>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f32 %f1, [t0_param_0];
-; FAST-NEXT:    ld.param.f32 %f2, [t0_param_1];
-; FAST-NEXT:    ld.param.f32 %f3, [t0_param_2];
+; FAST-NEXT:    ld.param.b32 %f1, [t0_param_0];
+; FAST-NEXT:    ld.param.b32 %f2, [t0_param_1];
+; FAST-NEXT:    ld.param.b32 %f3, [t0_param_2];
 ; FAST-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
-; FAST-NEXT:    st.param.f32 [func_retval0], %f4;
+; FAST-NEXT:    st.param.b32 [func_retval0], %f4;
 ; FAST-NEXT:    ret;
 ;
 ; DEFAULT-LABEL: t0(
@@ -30,12 +30,12 @@ define float @t0(float %a, float %b, float %c) {
 ; DEFAULT-NEXT:    .reg .b32 %f<6>;
 ; DEFAULT-EMPTY:
 ; DEFAULT-NEXT:  // %bb.0:
-; DEFAULT-NEXT:    ld.param.f32 %f1, [t0_param_0];
-; DEFAULT-NEXT:    ld.param.f32 %f2, [t0_param_1];
+; DEFAULT-NEXT:    ld.param.b32 %f1, [t0_param_0];
+; DEFAULT-NEXT:    ld.param.b32 %f2, [t0_param_1];
 ; DEFAULT-NEXT:    mul.rn.f32 %f3, %f1, %f2;
-; DEFAULT-NEXT:    ld.param.f32 %f4, [t0_param_2];
+; DEFAULT-NEXT:    ld.param.b32 %f4, [t0_param_2];
 ; DEFAULT-NEXT:    add.rn.f32 %f5, %f3, %f4;
-; DEFAULT-NEXT:    st.param.f32 [func_retval0], %f5;
+; DEFAULT-NEXT:    st.param.b32 [func_retval0], %f5;
 ; DEFAULT-NEXT:    ret;
   %v0 = fmul float %a, %b
   %v1 = fadd float %v0, %c
@@ -50,12 +50,12 @@ define float @t1(float %a, float %b) {
 ; FAST-NEXT:    .reg .b32 %f<6>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f32 %f1, [t1_param_0];
-; FAST-NEXT:    ld.param.f32 %f2, [t1_param_1];
+; FAST-NEXT:    ld.param.b32 %f1, [t1_param_0];
+; FAST-NEXT:    ld.param.b32 %f2, [t1_param_1];
 ; FAST-NEXT:    add.f32 %f3, %f1, %f2;
 ; FAST-NEXT:    sub.f32 %f4, %f1, %f2;
 ; FAST-NEXT:    mul.f32 %f5, %f3, %f4;
-; FAST-NEXT:    st.param.f32 [func_retval0], %f5;
+; FAST-NEXT:    st.param.b32 [func_retval0], %f5;
 ; FAST-NEXT:    ret;
 ;
 ; DEFAULT-LABEL: t1(
@@ -63,12 +63,12 @@ define float @t1(float %a, float %b) {
 ; DEFAULT-NEXT:    .reg .b32 %f<6>;
 ; DEFAULT-EMPTY:
 ; DEFAULT-NEXT:  // %bb.0:
-; DEFAULT-NEXT:    ld.param.f32 %f1, [t1_param_0];
-; DEFAULT-NEXT:    ld.param.f32 %f2, [t1_param_1];
+; DEFAULT-NEXT:    ld.param.b32 %f1, [t1_param_0];
+; DEFAULT-NEXT:    ld.param.b32 %f2, [t1_param_1];
 ; DEFAULT-NEXT:    add.rn.f32 %f3, %f1, %f2;
 ; DEFAULT-NEXT:    sub.rn.f32 %f4, %f1, %f2;
 ; DEFAULT-NEXT:    mul.rn.f32 %f5, %f3, %f4;
-; DEFAULT-NEXT:    st.param.f32 [func_retval0], %f5;
+; DEFAULT-NEXT:    st.param.b32 [func_retval0], %f5;
 ; DEFAULT-NEXT:    ret;
   %v1 = fadd float %a, %b
   %v2 = fsub float %a, %b
@@ -84,12 +84,12 @@ define float @t2(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [t2_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [t2_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [t2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [t2_param_1];
 ; CHECK-NEXT:    add.f32 %f3, %f1, %f2;
 ; CHECK-NEXT:    sub.f32 %f4, %f1, %f2;
 ; CHECK-NEXT:    mul.f32 %f5, %f3, %f4;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f5;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f5;
 ; CHECK-NEXT:    ret;
   %v1 = fadd contract float %a, %b
   %v2 = fsub contract float %a, %b
@@ -104,11 +104,11 @@ define float @t3(float %a, float %b, float %c) {
 ; CHECK-NEXT:    .reg .b32 %f<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [t3_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [t3_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [t3_param_2];
+; CHECK-NEXT:    ld.param.b32 %f1, [t3_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [t3_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [t3_param_2];
 ; CHECK-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
 ; CHECK-NEXT:    ret;
   %v0 = fmul contract float %a, %b
   %v1 = fadd contract float %v0, %c
diff --git a/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll b/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll
index fa8cc6e23b6b4..d40f514acd408 100644
--- a/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll
+++ b/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll
@@ -10,7 +10,7 @@ define fp128 @identity(fp128 %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [identity_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [identity_param_0];
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd1, %rd2};
 ; CHECK-NEXT:    ret;
   ret fp128 %x
@@ -22,10 +22,10 @@ define void @load_store(ptr %in, ptr %out) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [load_store_param_0];
-; CHECK-NEXT:    ld.v2.u64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT:    ld.param.u64 %rd4, [load_store_param_1];
-; CHECK-NEXT:    st.v2.u64 [%rd4], {%rd2, %rd3};
+; CHECK-NEXT:    ld.param.b64 %rd1, [load_store_param_0];
+; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd4, [load_store_param_1];
+; CHECK-NEXT:    st.v2.b64 [%rd4], {%rd2, %rd3};
 ; CHECK-NEXT:    ret;
   %val = load fp128, ptr %in
   store fp128 %val, ptr %out
@@ -38,7 +38,7 @@ define void @call(fp128 %x) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [call_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [call_param_0];
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.b64 [param0], {%rd1, %rd2};
diff --git a/llvm/test/CodeGen/NVPTX/frem.ll b/llvm/test/CodeGen/NVPTX/frem.ll
index 4077f6d1eb21b..c0658f85205e8 100644
--- a/llvm/test/CodeGen/NVPTX/frem.ll
+++ b/llvm/test/CodeGen/NVPTX/frem.ll
@@ -54,13 +54,13 @@ define float @frem_f32(float %a, float %b) {
 ; FAST-NEXT:    .reg .b32 %f<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f32 %f1, [frem_f32_param_0];
-; FAST-NEXT:    ld.param.f32 %f2, [frem_f32_param_1];
+; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_param_0];
+; FAST-NEXT:    ld.param.b32 %f2, [frem_f32_param_1];
 ; FAST-NEXT:    div.approx.f32 %f3, %f1, %f2;
 ; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
 ; FAST-NEXT:    neg.f32 %f5, %f4;
 ; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
-; FAST-NEXT:    st.param.f32 [func_retval0], %f6;
+; FAST-NEXT:    st.param.b32 [func_retval0], %f6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32(
@@ -69,15 +69,15 @@ define float @frem_f32(float %a, float %b) {
 ; NORMAL-NEXT:    .reg .b32 %f<8>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.f32 %f1, [frem_f32_param_0];
-; NORMAL-NEXT:    ld.param.f32 %f2, [frem_f32_param_1];
+; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_param_0];
+; NORMAL-NEXT:    ld.param.b32 %f2, [frem_f32_param_1];
 ; NORMAL-NEXT:    div.rn.f32 %f3, %f1, %f2;
 ; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
 ; NORMAL-NEXT:    neg.f32 %f5, %f4;
 ; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
 ; NORMAL-NEXT:    testp.infinite.f32 %p1, %f2;
 ; NORMAL-NEXT:    selp.f32 %f7, %f1, %f6, %p1;
-; NORMAL-NEXT:    st.param.f32 [func_retval0], %f7;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %f7;
 ; NORMAL-NEXT:    ret;
   %r = frem float %a, %b
   ret float %r
@@ -89,13 +89,13 @@ define double @frem_f64(double %a, double %b) {
 ; FAST-NEXT:    .reg .b64 %fd<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f64 %fd1, [frem_f64_param_0];
-; FAST-NEXT:    ld.param.f64 %fd2, [frem_f64_param_1];
+; FAST-NEXT:    ld.param.b64 %fd1, [frem_f64_param_0];
+; FAST-NEXT:    ld.param.b64 %fd2, [frem_f64_param_1];
 ; FAST-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
 ; FAST-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
 ; FAST-NEXT:    neg.f64 %fd5, %fd4;
 ; FAST-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
-; FAST-NEXT:    st.param.f64 [func_retval0], %fd6;
+; FAST-NEXT:    st.param.b64 [func_retval0], %fd6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f64(
@@ -104,15 +104,15 @@ define double @frem_f64(double %a, double %b) {
 ; NORMAL-NEXT:    .reg .b64 %fd<8>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.f64 %fd1, [frem_f64_param_0];
-; NORMAL-NEXT:    ld.param.f64 %fd2, [frem_f64_param_1];
+; NORMAL-NEXT:    ld.param.b64 %fd1, [frem_f64_param_0];
+; NORMAL-NEXT:    ld.param.b64 %fd2, [frem_f64_param_1];
 ; NORMAL-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
 ; NORMAL-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
 ; NORMAL-NEXT:    neg.f64 %fd5, %fd4;
 ; NORMAL-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
 ; NORMAL-NEXT:    testp.infinite.f64 %p1, %fd2;
 ; NORMAL-NEXT:    selp.f64 %fd7, %fd1, %fd6, %p1;
-; NORMAL-NEXT:    st.param.f64 [func_retval0], %fd7;
+; NORMAL-NEXT:    st.param.b64 [func_retval0], %fd7;
 ; NORMAL-NEXT:    ret;
   %r = frem double %a, %b
   ret double %r
@@ -164,13 +164,13 @@ define float @frem_f32_ninf(float %a, float %b) {
 ; FAST-NEXT:    .reg .b32 %f<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f32 %f1, [frem_f32_ninf_param_0];
-; FAST-NEXT:    ld.param.f32 %f2, [frem_f32_ninf_param_1];
+; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_ninf_param_0];
+; FAST-NEXT:    ld.param.b32 %f2, [frem_f32_ninf_param_1];
 ; FAST-NEXT:    div.approx.f32 %f3, %f1, %f2;
 ; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
 ; FAST-NEXT:    neg.f32 %f5, %f4;
 ; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
-; FAST-NEXT:    st.param.f32 [func_retval0], %f6;
+; FAST-NEXT:    st.param.b32 [func_retval0], %f6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32_ninf(
@@ -178,13 +178,13 @@ define float @frem_f32_ninf(float %a, float %b) {
 ; NORMAL-NEXT:    .reg .b32 %f<7>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.f32 %f1, [frem_f32_ninf_param_0];
-; NORMAL-NEXT:    ld.param.f32 %f2, [frem_f32_ninf_param_1];
+; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_ninf_param_0];
+; NORMAL-NEXT:    ld.param.b32 %f2, [frem_f32_ninf_param_1];
 ; NORMAL-NEXT:    div.rn.f32 %f3, %f1, %f2;
 ; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
 ; NORMAL-NEXT:    neg.f32 %f5, %f4;
 ; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f2, %f1;
-; NORMAL-NEXT:    st.param.f32 [func_retval0], %f6;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %f6;
 ; NORMAL-NEXT:    ret;
   %r = frem ninf float %a, %b
   ret float %r
@@ -196,13 +196,13 @@ define double @frem_f64_ninf(double %a, double %b) {
 ; FAST-NEXT:    .reg .b64 %fd<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f64 %fd1, [frem_f64_ninf_param_0];
-; FAST-NEXT:    ld.param.f64 %fd2, [frem_f64_ninf_param_1];
+; FAST-NEXT:    ld.param.b64 %fd1, [frem_f64_ninf_param_0];
+; FAST-NEXT:    ld.param.b64 %fd2, [frem_f64_ninf_param_1];
 ; FAST-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
 ; FAST-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
 ; FAST-NEXT:    neg.f64 %fd5, %fd4;
 ; FAST-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
-; FAST-NEXT:    st.param.f64 [func_retval0], %fd6;
+; FAST-NEXT:    st.param.b64 [func_retval0], %fd6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f64_ninf(
@@ -210,13 +210,13 @@ define double @frem_f64_ninf(double %a, double %b) {
 ; NORMAL-NEXT:    .reg .b64 %fd<7>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.f64 %fd1, [frem_f64_ninf_param_0];
-; NORMAL-NEXT:    ld.param.f64 %fd2, [frem_f64_ninf_param_1];
+; NORMAL-NEXT:    ld.param.b64 %fd1, [frem_f64_ninf_param_0];
+; NORMAL-NEXT:    ld.param.b64 %fd2, [frem_f64_ninf_param_1];
 ; NORMAL-NEXT:    div.rn.f64 %fd3, %fd1, %fd2;
 ; NORMAL-NEXT:    cvt.rzi.f64.f64 %fd4, %fd3;
 ; NORMAL-NEXT:    neg.f64 %fd5, %fd4;
 ; NORMAL-NEXT:    fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
-; NORMAL-NEXT:    st.param.f64 [func_retval0], %fd6;
+; NORMAL-NEXT:    st.param.b64 [func_retval0], %fd6;
 ; NORMAL-NEXT:    ret;
   %r = frem ninf double %a, %b
   ret double %r
@@ -228,11 +228,11 @@ define float @frem_f32_imm1(float %a) {
 ; FAST-NEXT:    .reg .b32 %f<5>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f32 %f1, [frem_f32_imm1_param_0];
+; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_imm1_param_0];
 ; FAST-NEXT:    mul.f32 %f2, %f1, 0f3E124925;
 ; FAST-NEXT:    cvt.rzi.f32.f32 %f3, %f2;
 ; FAST-NEXT:    fma.rn.f32 %f4, %f3, 0fC0E00000, %f1;
-; FAST-NEXT:    st.param.f32 [func_retval0], %f4;
+; FAST-NEXT:    st.param.b32 [func_retval0], %f4;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32_imm1(
@@ -240,11 +240,11 @@ define float @frem_f32_imm1(float %a) {
 ; NORMAL-NEXT:    .reg .b32 %f<5>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.f32 %f1, [frem_f32_imm1_param_0];
+; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_imm1_param_0];
 ; NORMAL-NEXT:    div.rn.f32 %f2, %f1, 0f40E00000;
 ; NORMAL-NEXT:    cvt.rzi.f32.f32 %f3, %f2;
 ; NORMAL-NEXT:    fma.rn.f32 %f4, %f3, 0fC0E00000, %f1;
-; NORMAL-NEXT:    st.param.f32 [func_retval0], %f4;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %f4;
 ; NORMAL-NEXT:    ret;
   %r = frem float %a, 7.0
   ret float %r
@@ -256,13 +256,13 @@ define float @frem_f32_imm2(float %a) {
 ; FAST-NEXT:    .reg .b32 %f<7>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.f32 %f1, [frem_f32_imm2_param_0];
+; FAST-NEXT:    ld.param.b32 %f1, [frem_f32_imm2_param_0];
 ; FAST-NEXT:    mov.b32 %f2, 0f40E00000;
 ; FAST-NEXT:    div.approx.f32 %f3, %f2, %f1;
 ; FAST-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
 ; FAST-NEXT:    neg.f32 %f5, %f4;
 ; FAST-NEXT:    fma.rn.f32 %f6, %f5, %f1, 0f40E00000;
-; FAST-NEXT:    st.param.f32 [func_retval0], %f6;
+; FAST-NEXT:    st.param.b32 [func_retval0], %f6;
 ; FAST-NEXT:    ret;
 ;
 ; NORMAL-LABEL: frem_f32_imm2(
@@ -271,7 +271,7 @@ define float @frem_f32_imm2(float %a) {
 ; NORMAL-NEXT:    .reg .b32 %f<8>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.f32 %f1, [frem_f32_imm2_param_0];
+; NORMAL-NEXT:    ld.param.b32 %f1, [frem_f32_imm2_param_0];
 ; NORMAL-NEXT:    mov.b32 %f2, 0f40E00000;
 ; NORMAL-NEXT:    div.rn.f32 %f3, %f2, %f1;
 ; NORMAL-NEXT:    cvt.rzi.f32.f32 %f4, %f3;
@@ -279,7 +279,7 @@ define float @frem_f32_imm2(float %a) {
 ; NORMAL-NEXT:    fma.rn.f32 %f6, %f5, %f1, 0f40E00000;
 ; NORMAL-NEXT:    testp.infinite.f32 %p1, %f1;
 ; NORMAL-NEXT:    selp.f32 %f7, 0f40E00000, %f6, %p1;
-; NORMAL-NEXT:    st.param.f32 [func_retval0], %f7;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %f7;
 ; NORMAL-NEXT:    ret;
   %r = frem float 7.0, %a
   ret float %r
diff --git a/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll
index 6f1532708f289..e06cf0fc4d48c 100644
--- a/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll
+++ b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll
@@ -13,9 +13,9 @@ define i32 @fshr_clamp_r(i32 %hi, i32 %lo, i32 %n) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [fshr_clamp_r_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [fshr_clamp_r_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [fshr_clamp_r_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [fshr_clamp_r_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fshr_clamp_r_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [fshr_clamp_r_param_2];
 ; CHECK-NEXT:    shf.r.clamp.b32 %r4, %r2, %r1, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -29,9 +29,9 @@ define i32 @fshl_clamp_r(i32 %hi, i32 %lo, i32 %n) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [fshl_clamp_r_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [fshl_clamp_r_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [fshl_clamp_r_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [fshl_clamp_r_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fshl_clamp_r_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [fshl_clamp_r_param_2];
 ; CHECK-NEXT:    shf.l.clamp.b32 %r4, %r2, %r1, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -45,8 +45,8 @@ define i32 @fshr_clamp_i(i32 %hi, i32 %lo) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [fshr_clamp_i_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [fshr_clamp_i_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [fshr_clamp_i_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fshr_clamp_i_param_1];
 ; CHECK-NEXT:    shf.r.clamp.b32 %r3, %r2, %r1, 3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -60,8 +60,8 @@ define i32 @fshl_clamp_i(i32 %hi, i32 %lo) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [fshl_clamp_i_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [fshl_clamp_i_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [fshl_clamp_i_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fshl_clamp_i_param_1];
 ; CHECK-NEXT:    shf.l.clamp.b32 %r3, %r2, %r1, 3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
index 2b6631154e387..04d8dbfcafb31 100644
--- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
+++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -15,9 +15,9 @@ target triple = "nvptx-nvidia-cuda"
 define ptx_kernel void @foo(ptr %a, ptr %b) {
 ; Expect one load -- @myconst isn't loaded from, because we know its value
 ; statically.
-; CHECK: ld.global.u32
-; CHECK: st.global.u32
-; CHECK: st.global.u32
+; CHECK: ld.global.b32
+; CHECK: st.global.b32
+; CHECK: st.global.b32
   %ld1 = load i32, ptr @myglobal
   %ld2 = load i32, ptr @myconst
   store i32 %ld1, ptr %a
diff --git a/llvm/test/CodeGen/NVPTX/globals_lowering.ll b/llvm/test/CodeGen/NVPTX/globals_lowering.ll
index b0b7aeb0900ac..d94e47fe3ba6b 100644
--- a/llvm/test/CodeGen/NVPTX/globals_lowering.ll
+++ b/llvm/test/CodeGen/NVPTX/globals_lowering.ll
@@ -7,10 +7,10 @@
 ; CHK-LABEL: foo
 define void @foo(float %f) {
 entry:
-  ; CHK: ld.shared.f32  %{{[a-zA-Z0-9]+}}, [Gbl+8];
+  ; CHK: ld.shared.b32  %{{[a-zA-Z0-9]+}}, [Gbl+8];
   %0 = load float, ptr addrspace(3) getelementptr inbounds ([1024 x %MyStruct], ptr addrspace(3) @Gbl, i32 0, i32 0, i32 2)
   %add = fadd float %0, %f
-  ; CHK: st.shared.f32   [Gbl+8], %{{[a-zA-Z0-9]+}};
+  ; CHK: st.shared.b32   [Gbl+8], %{{[a-zA-Z0-9]+}};
   store float %add, ptr addrspace(3) getelementptr inbounds ([1024 x %MyStruct], ptr addrspace(3) @Gbl, i32 0, i32 0, i32 2)
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/half.ll b/llvm/test/CodeGen/NVPTX/half.ll
index 1b53e246ecd17..a3ccf6e565b40 100644
--- a/llvm/test/CodeGen/NVPTX/half.ll
+++ b/llvm/test/CodeGen/NVPTX/half.ll
@@ -26,8 +26,8 @@ define void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out)
 
 define void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; CHECK-LABEL: @test_bitcast_to_half
-; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]]
+; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
+; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index 44ac46db254a7..bb88d1f2755ca 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -11,15 +11,15 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK:    .reg .b32 %r<4>;
 ; CHECK:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
-; CHECK:    ld.param.u64 %rd1, [foo_param_0];
+; CHECK:    ld.param.b64 %rd1, [foo_param_0];
 ; CHECK:    cvta.to.global.u64 %rd2, %rd1;
-; CHECK:    ld.param.u64 %rd3, [foo_param_1];
+; CHECK:    ld.param.b64 %rd3, [foo_param_1];
 ; CHECK:    cvta.to.global.u64 %rd4, %rd3;
-; CHECK:    ld.global.nc.u8 %rs1, [%rd2];
+; CHECK:    ld.global.nc.b8 %rs1, [%rd2];
 ; CHECK:    cvt.u32.u8 %r1, %rs1;
 ; CHECK:    add.s32 %r2, %r1, 1;
 ; CHECK:    and.b32 %r3, %r2, 1;
-; CHECK:    st.global.u32 [%rd4], %r3;
+; CHECK:    st.global.b32 [%rd4], %r3;
 ; CHECK:    ret;
   %ld = load i1, ptr %ptr, align 1
   %zext = zext i1 %ld to i32
diff --git a/llvm/test/CodeGen/NVPTX/i1-icmp.ll b/llvm/test/CodeGen/NVPTX/i1-icmp.ll
index 620f09653c951..e43a9da88a50e 100644
--- a/llvm/test/CodeGen/NVPTX/i1-icmp.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-icmp.ll
@@ -11,9 +11,9 @@ define i32 @icmp_i1_eq(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_eq_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_eq_param_0];
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r1, 1;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_eq_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_eq_param_1];
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r2, 1;
 ; CHECK-NEXT:    xor.pred %p3, %p1, %p2;
 ; CHECK-NEXT:    @%p3 bra $L__BB0_2;
@@ -42,9 +42,9 @@ define i32 @icmp_i1_ne(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_ne_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_ne_param_0];
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r1, 1;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_ne_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_ne_param_1];
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r2, 1;
 ; CHECK-NEXT:    xor.pred %p3, %p1, %p2;
 ; CHECK-NEXT:    not.pred %p4, %p3;
@@ -74,9 +74,9 @@ define i32 @icmp_i1_sgt(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_sgt_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_sgt_param_0];
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r1, 1;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_sgt_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_sgt_param_1];
 ; CHECK-NEXT:    setp.lt.s32 %p2, %r2, 2;
 ; CHECK-NEXT:    or.pred %p3, %p1, %p2;
 ; CHECK-NEXT:    @%p3 bra $L__BB2_2;
@@ -105,9 +105,9 @@ define i32 @icmp_i1_slt(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_slt_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_slt_param_0];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r1, 2;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_slt_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_slt_param_1];
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r2, 1;
 ; CHECK-NEXT:    or.pred %p3, %p2, %p1;
 ; CHECK-NEXT:    @%p3 bra $L__BB3_2;
@@ -136,9 +136,9 @@ define i32 @icmp_i1_sge(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_sge_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_sge_param_0];
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r1, 1;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_sge_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_sge_param_1];
 ; CHECK-NEXT:    setp.lt.s32 %p2, %r2, 2;
 ; CHECK-NEXT:    and.pred %p3, %p1, %p2;
 ; CHECK-NEXT:    @%p3 bra $L__BB4_2;
@@ -167,9 +167,9 @@ define i32 @icmp_i1_sle(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_sle_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_sle_param_0];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r1, 2;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_sle_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_sle_param_1];
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r2, 1;
 ; CHECK-NEXT:    and.pred %p3, %p2, %p1;
 ; CHECK-NEXT:    @%p3 bra $L__BB5_2;
@@ -198,9 +198,9 @@ define i32 @icmp_i1_uge(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_uge_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_uge_param_0];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r1, 2;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_uge_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_uge_param_1];
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r2, 1;
 ; CHECK-NEXT:    and.pred %p3, %p2, %p1;
 ; CHECK-NEXT:    @%p3 bra $L__BB6_2;
@@ -229,9 +229,9 @@ define i32 @icmp_i1_ugt(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_ugt_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_ugt_param_0];
 ; CHECK-NEXT:    setp.lt.s32 %p1, %r1, 2;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_ugt_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_ugt_param_1];
 ; CHECK-NEXT:    setp.gt.s32 %p2, %r2, 1;
 ; CHECK-NEXT:    or.pred %p3, %p2, %p1;
 ; CHECK-NEXT:    @%p3 bra $L__BB7_2;
@@ -260,9 +260,9 @@ define i32 @icmp_i1_ule(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_ule_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_ule_param_0];
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r1, 1;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_ule_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_ule_param_1];
 ; CHECK-NEXT:    setp.lt.s32 %p2, %r2, 2;
 ; CHECK-NEXT:    and.pred %p3, %p1, %p2;
 ; CHECK-NEXT:    @%p3 bra $L__BB8_2;
@@ -291,9 +291,9 @@ define i32 @icmp_i1_ult(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [icmp_i1_ult_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [icmp_i1_ult_param_0];
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r1, 1;
-; CHECK-NEXT:    ld.param.u32 %r2, [icmp_i1_ult_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [icmp_i1_ult_param_1];
 ; CHECK-NEXT:    setp.lt.s32 %p2, %r2, 2;
 ; CHECK-NEXT:    or.pred %p3, %p1, %p2;
 ; CHECK-NEXT:    @%p3 bra $L__BB9_2;
diff --git a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
index 84fd8226bb608..50d39c88a46b9 100644
--- a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll
@@ -12,12 +12,12 @@ define void @foo() {
 ; CHECK:    .reg .pred %p<2>;
 ; CHECK:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
-; CHECK:    ld.global.u8 %rs1, [i1g];
+; CHECK:    ld.global.b8 %rs1, [i1g];
 ; CHECK:    and.b16 %rs2, %rs1, 1;
 ; CHECK:    setp.ne.b16 %p1, %rs2, 0;
 ; CHECK:    @%p1 bra $L__BB0_2;
 ; CHECK:    mov.b16 %rs3, 1;
-; CHECK:    st.global.u8 [i1g], %rs3;
+; CHECK:    st.global.b8 [i1g], %rs3;
 ; CHECK:    ret;
   %tmp = load i1, ptr addrspace(1) @i1g, align 2
   br i1 %tmp, label %if.end, label %if.then
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index d24b06c4d721c..6fb5aad4b1eb9 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -11,16 +11,16 @@ define i32 @test_select_i1_trunc(i32 %a, i32 %b, i32 %c, i32 %true, i32 %false)
 ; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_select_i1_trunc_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_trunc_param_0];
 ; CHECK-NEXT:    and.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r2, 0;
-; CHECK-NEXT:    ld.param.u32 %r3, [test_select_i1_trunc_param_1];
-; CHECK-NEXT:    ld.param.u32 %r4, [test_select_i1_trunc_param_2];
-; CHECK-NEXT:    ld.param.u32 %r5, [test_select_i1_trunc_param_3];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_trunc_param_1];
+; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_trunc_param_2];
+; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_trunc_param_3];
 ; CHECK-NEXT:    selp.b32 %r6, %r3, %r4, %p1;
 ; CHECK-NEXT:    and.b32 %r7, %r6, 1;
 ; CHECK-NEXT:    setp.ne.b32 %p2, %r7, 0;
-; CHECK-NEXT:    ld.param.u32 %r8, [test_select_i1_trunc_param_4];
+; CHECK-NEXT:    ld.param.b32 %r8, [test_select_i1_trunc_param_4];
 ; CHECK-NEXT:    selp.b32 %r9, %r5, %r8, %p2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
 ; CHECK-NEXT:    ret;
@@ -41,16 +41,16 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_select_i1_trunc_2_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_select_i1_trunc_2_param_0];
 ; CHECK-NEXT:    and.b64 %rd2, %rd1, 1;
 ; CHECK-NEXT:    setp.ne.b64 %p1, %rd2, 0;
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_select_i1_trunc_2_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs2, [test_select_i1_trunc_2_param_2];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_select_i1_trunc_2_param_3];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_select_i1_trunc_2_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs2, [test_select_i1_trunc_2_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_trunc_2_param_3];
 ; CHECK-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
 ; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p2, %rs4, 0;
-; CHECK-NEXT:    ld.param.u32 %r2, [test_select_i1_trunc_2_param_4];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_trunc_2_param_4];
 ; CHECK-NEXT:    selp.b32 %r3, %r1, %r2, %p2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -69,15 +69,15 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 ; CHECK-NEXT:    .reg .b32 %r<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_select_i1_basic_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_select_i1_basic_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_basic_param_1];
 ; CHECK-NEXT:    or.b32 %r4, %r1, %r2;
 ; CHECK-NEXT:    setp.ne.s32 %p1, %r1, 0;
-; CHECK-NEXT:    ld.param.u32 %r5, [test_select_i1_basic_param_2];
+; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_param_2];
 ; CHECK-NEXT:    setp.eq.s32 %p2, %r5, 0;
-; CHECK-NEXT:    ld.param.u32 %r7, [test_select_i1_basic_param_3];
+; CHECK-NEXT:    ld.param.b32 %r7, [test_select_i1_basic_param_3];
 ; CHECK-NEXT:    setp.eq.s32 %p3, %r4, 0;
-; CHECK-NEXT:    ld.param.u32 %r8, [test_select_i1_basic_param_4];
+; CHECK-NEXT:    ld.param.b32 %r8, [test_select_i1_basic_param_4];
 ; CHECK-NEXT:    selp.b32 %r9, %r7, %r8, %p2;
 ; CHECK-NEXT:    selp.b32 %r10, %r9, %r8, %p1;
 ; CHECK-NEXT:    selp.b32 %r11, %r7, %r10, %p3;
@@ -98,16 +98,16 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_select_i1_basic_folding_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
 ; CHECK-NEXT:    setp.eq.s32 %p1, %r1, 0;
-; CHECK-NEXT:    ld.param.u32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
 ; CHECK-NEXT:    setp.ne.s32 %p2, %r2, 0;
 ; CHECK-NEXT:    setp.eq.s32 %p3, %r2, 0;
-; CHECK-NEXT:    ld.param.u32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
 ; CHECK-NEXT:    setp.eq.s32 %p4, %r3, 0;
-; CHECK-NEXT:    ld.param.u32 %r4, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
 ; CHECK-NEXT:    xor.pred %p6, %p1, %p3;
-; CHECK-NEXT:    ld.param.u32 %r5, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
 ; CHECK-NEXT:    and.pred %p7, %p6, %p4;
 ; CHECK-NEXT:    and.pred %p9, %p2, %p4;
 ; CHECK-NEXT:    and.pred %p10, %p3, %p7;
diff --git a/llvm/test/CodeGen/NVPTX/i128-array.ll b/llvm/test/CodeGen/NVPTX/i128-array.ll
index dd6d48bd5862c..3bb9c6aec51ac 100644
--- a/llvm/test/CodeGen/NVPTX/i128-array.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-array.ll
@@ -8,8 +8,8 @@ define [2 x i128] @foo(i64 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [foo_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [foo_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [foo_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [foo_param_0];
 ; CHECK-NEXT:    shr.s64 %rd2, %rd1, 63;
 ; CHECK-NEXT:    cvt.s64.s32 %rd3, %r1;
 ; CHECK-NEXT:    shr.s64 %rd4, %rd3, 63;
@@ -30,8 +30,8 @@ define [2 x i128] @foo2(ptr byval([2 x i128]) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [foo2_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd5, %rd6}, [foo2_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [foo2_param_0+16];
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd4};
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd5, %rd6};
 ; CHECK-NEXT:    ret;
@@ -51,8 +51,8 @@ define [2 x i128] @foo3([2 x i128] %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [foo3_param_0+16];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [foo3_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [foo3_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [foo3_param_0];
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd1, %rd2};
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll
index 849d96f2cf3b6..4f4c2fe73ba7f 100644
--- a/llvm/test/CodeGen/NVPTX/i128-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-param.ll
@@ -5,8 +5,8 @@
 ; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16],
 ; CHECK-NEXT: .param .align 16 .b8 callee_param_1[16],
 define void @callee(i128, i128, ptr) {
-  ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
-  ; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1];
+  ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
+  ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1];
 
   ; CHECK:      mul.lo.s64 %[[REG4:rd[0-9]+]], %[[REG0]], %[[REG3]];
 	; CHECK-NEXT: mul.hi.u64 %[[REG5:rd[0-9]+]], %[[REG0]], %[[REG2]];
@@ -25,8 +25,8 @@ define void @callee(i128, i128, ptr) {
 ; CHECK-NEXT: .param .align 16 .b8 caller_kernel_param_1[16],
 define ptx_kernel void @caller_kernel(i128, i128, ptr) {
 start:
-  ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_kernel_param_0];
-  ; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1];
+  ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_kernel_param_0];
+  ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1];
 
   ; CHECK:      { // callseq [[CALLSEQ_ID:[0-9]]], 0
 	; CHECK:      .param .align 16 .b8 param0[16];
@@ -44,8 +44,8 @@ start:
 ; CHECK-NEXT: .param .align 16 .b8 caller_func_param_1[16],
 define void @caller_func(i128, i128, ptr) {
 start:
-  ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_func_param_0]
-  ; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1]
+  ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_func_param_0]
+  ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1]
 
   ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
 	; CHECK: .param .align 16 .b8 param0[16];
diff --git a/llvm/test/CodeGen/NVPTX/i128-retval.ll b/llvm/test/CodeGen/NVPTX/i128-retval.ll
index a01d14d5ca776..7fea1c43aad27 100644
--- a/llvm/test/CodeGen/NVPTX/i128-retval.ll
+++ b/llvm/test/CodeGen/NVPTX/i128-retval.ll
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee(
 define i128 @callee(i128) {
-  ; CHECK: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
+  ; CHECK: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
   ; CHECK: st.param.v2.b64 [func_retval0], {%[[REG0]], %[[REG1]]}
   ret i128 %0
 }
@@ -11,8 +11,8 @@ define i128 @callee(i128) {
 ; CHECK-LABEL: .visible .func caller(
 define void @caller(i128, ptr) {
 start:
-  ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_param_0];
-  ; CHECK-DAG: ld.param.u64 %[[OUT:rd[0-9]+]],  [caller_param_1];
+  ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_param_0];
+  ; CHECK-DAG: ld.param.b64 %[[OUT:rd[0-9]+]],  [caller_param_1];
 
   ; CHECK: { // callseq 0, 0
 	; CHECK: .param .align 16 .b8 retval0[16];
@@ -21,7 +21,7 @@ start:
 	; CHECK: } // callseq 0
   %a = call i128 @callee(i128 %0)
 
-	; CHECK-DAG: st.v2.u64 [%[[OUT]]], {%[[REG2]], %[[REG3]]};
+	; CHECK-DAG: st.v2.b64 [%[[OUT]]], {%[[REG2]], %[[REG3]]};
   store i128 %a, ptr %1
 
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index 64786e601c4b5..6b82a688d427d 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -10,8 +10,8 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<127>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd49, %rd50}, [srem_i128_param_1];
 ; CHECK-NEXT:    shr.s64 %rd2, %rd46, 63;
 ; CHECK-NEXT:    mov.b64 %rd117, 0;
 ; CHECK-NEXT:    sub.cc.s64 %rd52, %rd117, %rd45;
@@ -151,8 +151,8 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<113>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [urem_i128_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [urem_i128_param_1];
 ; CHECK-NEXT:    or.b64 %rd45, %rd3, %rd4;
 ; CHECK-NEXT:    setp.eq.s64 %p1, %rd45, 0;
 ; CHECK-NEXT:    or.b64 %rd46, %rd41, %rd42;
@@ -275,7 +275,7 @@ define i128 @srem_i128_pow2k(i128 %lhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [srem_i128_pow2k_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [srem_i128_pow2k_param_0];
 ; CHECK-NEXT:    shr.s64 %rd3, %rd2, 63;
 ; CHECK-NEXT:    shr.u64 %rd4, %rd3, 31;
 ; CHECK-NEXT:    add.cc.s64 %rd5, %rd1, %rd4;
@@ -295,7 +295,7 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [urem_i128_pow2k_param_0];
 ; CHECK-NEXT:    and.b64 %rd3, %rd1, 8589934591;
 ; CHECK-NEXT:    mov.b64 %rd4, 0;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd4};
@@ -312,8 +312,8 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<122>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd49, %rd50}, [sdiv_i128_param_1];
 ; CHECK-NEXT:    mov.b64 %rd112, 0;
 ; CHECK-NEXT:    sub.cc.s64 %rd52, %rd112, %rd45;
 ; CHECK-NEXT:    subc.cc.s64 %rd53, %rd112, %rd46;
@@ -448,8 +448,8 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<107>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT:    ld.param.v2.u64 {%rd41, %rd42}, [udiv_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd43, %rd44}, [udiv_i128_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd43, %rd44}, [udiv_i128_param_1];
 ; CHECK-NEXT:    or.b64 %rd45, %rd43, %rd44;
 ; CHECK-NEXT:    setp.eq.s64 %p1, %rd45, 0;
 ; CHECK-NEXT:    or.b64 %rd46, %rd41, %rd42;
@@ -566,7 +566,7 @@ define i128 @sdiv_i128_pow2k(i128 %lhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<11>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [sdiv_i128_pow2k_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [sdiv_i128_pow2k_param_0];
 ; CHECK-NEXT:    shr.s64 %rd3, %rd2, 63;
 ; CHECK-NEXT:    shr.u64 %rd4, %rd3, 31;
 ; CHECK-NEXT:    add.cc.s64 %rd5, %rd1, %rd4;
@@ -587,7 +587,7 @@ define i128 @udiv_i128_pow2k(i128 %lhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [udiv_i128_pow2k_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [udiv_i128_pow2k_param_0];
 ; CHECK-NEXT:    shl.b64 %rd3, %rd2, 31;
 ; CHECK-NEXT:    shr.u64 %rd4, %rd1, 33;
 ; CHECK-NEXT:    or.b64 %rd5, %rd4, %rd3;
@@ -604,8 +604,8 @@ define i128 @add_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [add_i128_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [add_i128_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [add_i128_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [add_i128_param_1];
 ; CHECK-NEXT:    add.cc.s64 %rd5, %rd1, %rd3;
 ; CHECK-NEXT:    addc.cc.s64 %rd6, %rd2, %rd4;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd5, %rd6};
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 1fd044cd2efc0..5bfa5b2bc63a5 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -39,7 +39,7 @@ define i16 @test_extract_0(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_0_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_extract_0_param_0];
 ; I16x2-NEXT:    mov.b32 {%rs1, _}, %r1;
 ; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
 ; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
@@ -56,7 +56,7 @@ define i16 @test_extract_1(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_1_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_extract_1_param_0];
 ; I16x2-NEXT:    mov.b32 {_, %rs1}, %r1;
 ; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
 ; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
@@ -75,8 +75,8 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
 ; COMMON-NEXT:    .reg .b64 %rd<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_i_param_0];
+; COMMON-NEXT:    ld.param.b64 %rd1, [test_extract_i_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_extract_i_param_0];
 ; COMMON-NEXT:    setp.eq.s64 %p1, %rd1, 0;
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
@@ -93,8 +93,8 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
 ; I16x2-NEXT:    .reg .b32 %r<4>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
-; I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
+; I16x2-NEXT:    ld.param.b32 %r2, [test_add_param_1];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_add_param_0];
 ; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; I16x2-NEXT:    ret;
@@ -105,8 +105,8 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<4>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
-; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_add_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_add_param_0];
 ; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; NO-I16x2-NEXT:    add.s16 %rs5, %rs4, %rs2;
@@ -125,7 +125,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
 ; I16x2-NEXT:    .reg .b32 %r<4>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_add_imm_0_param_0];
 ; I16x2-NEXT:    mov.b32 %r2, 131073;
 ; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -137,7 +137,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<3>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_add_imm_0_param_0];
 ; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
 ; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
@@ -154,7 +154,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
 ; I16x2-NEXT:    .reg .b32 %r<4>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_add_imm_1_param_0];
 ; I16x2-NEXT:    mov.b32 %r2, 131073;
 ; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -166,7 +166,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<3>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_add_imm_1_param_0];
 ; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
 ; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
@@ -184,8 +184,8 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_sub_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_sub_param_0];
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; COMMON-NEXT:    sub.s16 %rs5, %rs4, %rs2;
@@ -203,8 +203,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
 ; I16x2-NEXT:    .reg .b32 %r<4>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
-; I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
+; I16x2-NEXT:    ld.param.b32 %r2, [test_smax_param_1];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_smax_param_0];
 ; I16x2-NEXT:    max.s16x2 %r3, %r1, %r2;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; I16x2-NEXT:    ret;
@@ -215,8 +215,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<4>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
-; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_smax_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_smax_param_0];
 ; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; NO-I16x2-NEXT:    max.s16 %rs5, %rs4, %rs2;
@@ -235,8 +235,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
 ; I16x2-NEXT:    .reg .b32 %r<4>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
-; I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
+; I16x2-NEXT:    ld.param.b32 %r2, [test_umax_param_1];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_umax_param_0];
 ; I16x2-NEXT:    max.u16x2 %r3, %r1, %r2;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; I16x2-NEXT:    ret;
@@ -247,8 +247,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<4>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
-; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_umax_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_umax_param_0];
 ; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; NO-I16x2-NEXT:    max.u16 %rs5, %rs4, %rs2;
@@ -267,8 +267,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
 ; I16x2-NEXT:    .reg .b32 %r<4>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
-; I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
+; I16x2-NEXT:    ld.param.b32 %r2, [test_smin_param_1];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_smin_param_0];
 ; I16x2-NEXT:    min.s16x2 %r3, %r1, %r2;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; I16x2-NEXT:    ret;
@@ -279,8 +279,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<4>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
-; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_smin_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_smin_param_0];
 ; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; NO-I16x2-NEXT:    min.s16 %rs5, %rs4, %rs2;
@@ -299,8 +299,8 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
 ; I16x2-NEXT:    .reg .b32 %r<4>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
-; I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
+; I16x2-NEXT:    ld.param.b32 %r2, [test_umin_param_1];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_umin_param_0];
 ; I16x2-NEXT:    min.u16x2 %r3, %r1, %r2;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; I16x2-NEXT:    ret;
@@ -311,8 +311,8 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<4>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
-; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_umin_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_umin_param_0];
 ; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; NO-I16x2-NEXT:    min.u16 %rs5, %rs4, %rs2;
@@ -332,8 +332,8 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_mul_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_mul_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_mul_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_mul_param_0];
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
 ; COMMON-NEXT:    mul.lo.s16 %rs5, %rs4, %rs2;
@@ -352,8 +352,8 @@ define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_or_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_or_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_or_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_or_param_0];
 ; COMMON-NEXT:    or.b32 %r3, %r1, %r2;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
 ; COMMON-NEXT:    ret;
@@ -370,7 +370,7 @@ define <2 x i16> @test_or_computed(i16 %a) {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u16 %rs1, [test_or_computed_param_0];
+; COMMON-NEXT:    ld.param.b16 %rs1, [test_or_computed_param_0];
 ; COMMON-NEXT:    mov.b16 %rs2, 0;
 ; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
 ; COMMON-NEXT:    mov.b16 %rs3, 5;
@@ -391,7 +391,7 @@ define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_0_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_or_imm_0_param_0];
 ; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
 ; COMMON-NEXT:    ret;
@@ -405,7 +405,7 @@ define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_1_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_or_imm_1_param_0];
 ; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
 ; COMMON-NEXT:    ret;
@@ -419,8 +419,8 @@ define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_xor_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_xor_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_xor_param_0];
 ; COMMON-NEXT:    xor.b32 %r3, %r1, %r2;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
 ; COMMON-NEXT:    ret;
@@ -435,7 +435,7 @@ define <2 x i16> @test_xor_computed(i16 %a) {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u16 %rs1, [test_xor_computed_param_0];
+; COMMON-NEXT:    ld.param.b16 %rs1, [test_xor_computed_param_0];
 ; COMMON-NEXT:    mov.b16 %rs2, 0;
 ; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
 ; COMMON-NEXT:    mov.b16 %rs3, 5;
@@ -456,7 +456,7 @@ define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_0_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_xor_imm_0_param_0];
 ; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
 ; COMMON-NEXT:    ret;
@@ -470,7 +470,7 @@ define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_1_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_xor_imm_1_param_0];
 ; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
 ; COMMON-NEXT:    ret;
@@ -484,8 +484,8 @@ define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_and_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_and_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_and_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_and_param_0];
 ; COMMON-NEXT:    and.b32 %r3, %r1, %r2;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
 ; COMMON-NEXT:    ret;
@@ -502,7 +502,7 @@ define <2 x i16> @test_and_computed(i16 %a) {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u16 %rs1, [test_and_computed_param_0];
+; COMMON-NEXT:    ld.param.b16 %rs1, [test_and_computed_param_0];
 ; COMMON-NEXT:    mov.b16 %rs2, 0;
 ; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
 ; COMMON-NEXT:    mov.b16 %rs3, 5;
@@ -523,7 +523,7 @@ define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_0_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_and_imm_0_param_0];
 ; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
 ; COMMON-NEXT:    ret;
@@ -537,7 +537,7 @@ define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_1_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_and_imm_1_param_0];
 ; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
 ; COMMON-NEXT:    ret;
@@ -552,10 +552,10 @@ define void @test_ldst_v2i16(ptr %a, ptr %b) {
 ; COMMON-NEXT:    .reg .b64 %rd<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v2i16_param_1];
-; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v2i16_param_0];
-; COMMON-NEXT:    ld.u32 %r1, [%rd1];
-; COMMON-NEXT:    st.u32 [%rd2], %r1;
+; COMMON-NEXT:    ld.param.b64 %rd2, [test_ldst_v2i16_param_1];
+; COMMON-NEXT:    ld.param.b64 %rd1, [test_ldst_v2i16_param_0];
+; COMMON-NEXT:    ld.b32 %r1, [%rd1];
+; COMMON-NEXT:    st.b32 [%rd2], %r1;
 ; COMMON-NEXT:    ret;
   %t1 = load <2 x i16>, ptr %a
   store <2 x i16> %t1, ptr %b, align 16
@@ -572,12 +572,12 @@ define void @test_ldst_v3i16(ptr %a, ptr %b) {
 ; COMMON-NEXT:    .reg .b64 %rd<5>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v3i16_param_1];
-; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v3i16_param_0];
-; COMMON-NEXT:    ld.u64 %rd3, [%rd1];
+; COMMON-NEXT:    ld.param.b64 %rd2, [test_ldst_v3i16_param_1];
+; COMMON-NEXT:    ld.param.b64 %rd1, [test_ldst_v3i16_param_0];
+; COMMON-NEXT:    ld.b64 %rd3, [%rd1];
 ; COMMON-NEXT:    shr.u64 %rd4, %rd3, 32;
-; COMMON-NEXT:    st.u32 [%rd2], %rd3;
-; COMMON-NEXT:    st.u16 [%rd2+4], %rd4;
+; COMMON-NEXT:    st.b32 [%rd2], %rd3;
+; COMMON-NEXT:    st.b16 [%rd2+4], %rd4;
 ; COMMON-NEXT:    ret;
   %t1 = load <3 x i16>, ptr %a
   store <3 x i16> %t1, ptr %b, align 16
@@ -591,10 +591,10 @@ define void @test_ldst_v4i16(ptr %a, ptr %b) {
 ; COMMON-NEXT:    .reg .b64 %rd<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i16_param_1];
-; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i16_param_0];
-; COMMON-NEXT:    ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; COMMON-NEXT:    st.v4.u16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
+; COMMON-NEXT:    ld.param.b64 %rd2, [test_ldst_v4i16_param_1];
+; COMMON-NEXT:    ld.param.b64 %rd1, [test_ldst_v4i16_param_0];
+; COMMON-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; COMMON-NEXT:    st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
 ; COMMON-NEXT:    ret;
   %t1 = load <4 x i16>, ptr %a
   store <4 x i16> %t1, ptr %b, align 16
@@ -608,8 +608,8 @@ define void @test_ldst_v8i16(ptr %a, ptr %b) {
 ; COMMON-NEXT:    .reg .b64 %rd<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i16_param_1];
-; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i16_param_0];
+; COMMON-NEXT:    ld.param.b64 %rd2, [test_ldst_v8i16_param_1];
+; COMMON-NEXT:    ld.param.b64 %rd1, [test_ldst_v8i16_param_0];
 ; COMMON-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; COMMON-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
 ; COMMON-NEXT:    ret;
@@ -626,8 +626,8 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<5>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_call_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_call_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_call_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_call_param_0];
 ; COMMON-NEXT:    { // callseq 0, 0
 ; COMMON-NEXT:    .param .align 4 .b8 param0[4];
 ; COMMON-NEXT:    st.param.b32 [param0], %r1;
@@ -654,8 +654,8 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<5>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_call_flipped_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_call_flipped_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_call_flipped_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_call_flipped_param_0];
 ; COMMON-NEXT:    { // callseq 1, 0
 ; COMMON-NEXT:    .param .align 4 .b8 param0[4];
 ; COMMON-NEXT:    st.param.b32 [param0], %r2;
@@ -682,8 +682,8 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<5>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r2, [test_tailcall_flipped_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_tailcall_flipped_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_tailcall_flipped_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_tailcall_flipped_param_0];
 ; COMMON-NEXT:    { // callseq 2, 0
 ; COMMON-NEXT:    .param .align 4 .b8 param0[4];
 ; COMMON-NEXT:    st.param.b32 [param0], %r2;
@@ -712,11 +712,11 @@ define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
+; COMMON-NEXT:    ld.param.b8 %rs1, [test_select_param_2];
 ; COMMON-NEXT:    and.b16 %rs2, %rs1, 1;
 ; COMMON-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; COMMON-NEXT:    ld.param.u32 %r2, [test_select_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_select_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_select_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_select_param_0];
 ; COMMON-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
 ; COMMON-NEXT:    ret;
@@ -732,10 +732,10 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x
 ; COMMON-NEXT:    .reg .b32 %r<6>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r4, [test_select_cc_param_3];
-; COMMON-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
-; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
+; COMMON-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
+; COMMON-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
 ; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
@@ -760,10 +760,10 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b,
 ; COMMON-NEXT:    .reg .b32 %r<9>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
-; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
-; COMMON-NEXT:    ld.param.u32 %r6, [test_select_cc_i32_i16_param_3];
-; COMMON-NEXT:    ld.param.u32 %r5, [test_select_cc_i32_i16_param_2];
+; COMMON-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
+; COMMON-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
+; COMMON-NEXT:    ld.param.b32 %r6, [test_select_cc_i32_i16_param_3];
+; COMMON-NEXT:    ld.param.b32 %r5, [test_select_cc_i32_i16_param_2];
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r6;
 ; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r5;
 ; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
@@ -786,10 +786,10 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b,
 ; COMMON-NEXT:    .reg .b32 %r<8>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.u32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
-; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
-; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_i16_i32_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_i16_i32_param_0];
+; COMMON-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
+; COMMON-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_select_cc_i16_i32_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_select_cc_i16_i32_param_0];
 ; COMMON-NEXT:    setp.ne.s32 %p1, %r3, %r5;
 ; COMMON-NEXT:    setp.ne.s32 %p2, %r4, %r6;
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
@@ -812,7 +812,7 @@ define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0];
+; COMMON-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_param_0];
 ; COMMON-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
 ; COMMON-NEXT:    ret;
@@ -827,12 +827,12 @@ define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 {
 ; I16x2-NEXT:    .reg .b64 %rd<2>;
 ; I16x2-EMPTY:
 ; I16x2-NEXT:  // %bb.0:
-; I16x2-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
-; I16x2-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
+; I16x2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
+; I16x2-NEXT:    ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
 ; I16x2-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
 ; I16x2-NEXT:    mov.b32 %r4, 65537;
 ; I16x2-NEXT:    add.s16x2 %r5, %r3, %r4;
-; I16x2-NEXT:    st.u32 [%rd1], %r5;
+; I16x2-NEXT:    st.b32 [%rd1], %r5;
 ; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; I16x2-NEXT:    ret;
 ;
@@ -843,15 +843,15 @@ define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 {
 ; NO-I16x2-NEXT:    .reg .b64 %rd<2>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
-; NO-I16x2-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
+; NO-I16x2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
+; NO-I16x2-NEXT:    ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
 ; NO-I16x2-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; NO-I16x2-NEXT:    cvt.u16.u32 %rs2, %r1;
 ; NO-I16x2-NEXT:    mov.b32 %r3, {%rs2, %rs1};
 ; NO-I16x2-NEXT:    add.s16 %rs3, %rs1, 1;
 ; NO-I16x2-NEXT:    add.s16 %rs4, %rs2, 1;
 ; NO-I16x2-NEXT:    mov.b32 %r4, {%rs4, %rs3};
-; NO-I16x2-NEXT:    st.u32 [%rd1], %r4;
+; NO-I16x2-NEXT:    st.b32 [%rd1], %r4;
 ; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
 ; NO-I16x2-NEXT:    ret;
   %r = trunc <2 x i32> %a to <2 x i16>
@@ -869,12 +869,12 @@ define <2 x i16> @test_trunc_2xi32_muliple_use1(<2 x i32> %a, ptr %p) #0 {
 ; COMMON-NEXT:    .reg .b64 %rd<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0];
-; COMMON-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use1_param_1];
+; COMMON-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0];
+; COMMON-NEXT:    ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use1_param_1];
 ; COMMON-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
 ; COMMON-NEXT:    add.s32 %r4, %r2, 1;
 ; COMMON-NEXT:    add.s32 %r5, %r1, 1;
-; COMMON-NEXT:    st.v2.u32 [%rd1], {%r5, %r4};
+; COMMON-NEXT:    st.v2.b32 [%rd1], {%r5, %r4};
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
 ; COMMON-NEXT:    ret;
   %r = trunc <2 x i32> %a to <2 x i16>
@@ -893,7 +893,7 @@ define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 {
 ; COMMON-NEXT:    .reg .b64 %rd<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
+; COMMON-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
 ; COMMON-NEXT:    cvt.u16.u64 %rs1, %rd2;
 ; COMMON-NEXT:    cvt.u16.u64 %rs2, %rd1;
 ; COMMON-NEXT:    mov.b32 %r1, {%rs2, %rs1};
@@ -910,7 +910,7 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi32_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_zext_2xi32_param_0];
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
 ; COMMON-NEXT:    cvt.u32.u16 %r3, %rs2;
@@ -928,7 +928,7 @@ define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b64 %rd<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_zext_2xi64_param_0];
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    cvt.u64.u16 %rd1, %rs2;
 ; COMMON-NEXT:    cvt.u64.u16 %rd2, %rs1;
@@ -944,7 +944,7 @@ define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_bitcast_i32_to_2xi16_param_0];
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
 ; COMMON-NEXT:    ret;
   %r = bitcast i32 %a to <2 x i16>
@@ -957,7 +957,7 @@ define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_bitcast_2xi16_to_i32_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_bitcast_2xi16_to_i32_param_0];
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
 ; COMMON-NEXT:    ret;
   %r = bitcast <2 x i16> %a to i32
@@ -971,7 +971,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0];
+; COMMON-NEXT:    ld.param.b16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0];
 ; COMMON-NEXT:    mov.b16 %rs2, 5;
 ; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -990,7 +990,7 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u32 %r1, [test_shufflevector_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_shufflevector_param_0];
 ; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
@@ -1006,8 +1006,8 @@ define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.u16 %rs1, [test_insertelement_param_1];
-; COMMON-NEXT:    ld.param.u32 %r1, [test_insertelement_param_0];
+; COMMON-NEXT:    ld.param.b16 %rs1, [test_insertelement_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_insertelement_param_0];
 ; I16x2-NEXT:    mov.b32 {%rs2, _}, %r1;
 ; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
 ; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
diff --git a/llvm/test/CodeGen/NVPTX/i8-param.ll b/llvm/test/CodeGen/NVPTX/i8-param.ll
index 7c5134696c25d..0679ba0fe5727 100644
--- a/llvm/test/CodeGen/NVPTX/i8-param.ll
+++ b/llvm/test/CodeGen/NVPTX/i8-param.ll
@@ -5,7 +5,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; CHECK: .visible .func  (.param .b32 func_retval0) callee
 define i8 @callee(i8 %a) {
-; CHECK: ld.param.u8
+; CHECK: ld.param.b8
   %ret = add i8 %a, 42
 ; CHECK: st.param.b32
   ret i8 %ret
@@ -13,7 +13,7 @@ define i8 @callee(i8 %a) {
 
 ; CHECK: .visible .func caller
 define void @caller(ptr %a) {
-; CHECK: ld.u8
+; CHECK: ld.b8
   %val = load i8, ptr %a
   %ret = tail call i8 @callee(i8 %val)
 ; CHECK: ld.param.b32
diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
index e9662dd8a7fa3..fe81134895926 100644
--- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
@@ -17,7 +17,7 @@ define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_2xi8_i16_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    shl.b16 %rs3, %rs2, 8;
 ; CHECK-NEXT:    and.b16 %rs4, %rs1, 255;
@@ -36,7 +36,7 @@ define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
 ; CHECK-NEXT:    shr.u16 %rs2, %rs1, 8;
 ; CHECK-NEXT:    mov.b32 %r1, {%rs1, %rs2};
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 65edcf2e07159..642d5d0e538a2 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -29,7 +29,7 @@ define i8 @test_extract_0(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_0_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -43,7 +43,7 @@ define i8 @test_extract_1(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_1_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 8, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -57,7 +57,7 @@ define i8 @test_extract_2(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_2_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 16, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -71,7 +71,7 @@ define i8 @test_extract_3(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_3_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_3_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -86,8 +86,8 @@ define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_i_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_extract_i_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_i_param_0];
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
 ; CHECK-NEXT:    shl.b32 %r3, %r2, 3;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, %r3, 8;
@@ -104,8 +104,8 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_add_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_add_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_add_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_add_param_0];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
@@ -146,7 +146,7 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_add_imm_0_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 4;
@@ -179,7 +179,7 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_add_imm_1_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 4;
@@ -212,8 +212,8 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_sub_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sub_param_0];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
@@ -254,8 +254,8 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_smax_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_smax_param_0];
 ; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r4, %r3;
@@ -297,8 +297,8 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_umax_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_umax_param_0];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p1, %r4, %r3;
@@ -332,8 +332,8 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_smin_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_smin_param_0];
 ; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    setp.le.s32 %p1, %r4, %r3;
@@ -375,8 +375,8 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_umin_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_umin_param_0];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    setp.ls.u32 %p1, %r4, %r3;
@@ -410,9 +410,9 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<23>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r3, [test_eq_param_2];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_eq_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_eq_param_0];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_eq_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_eq_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_eq_param_0];
 ; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
 ; CHECK-NEXT:    setp.eq.u32 %p1, %r5, %r4;
@@ -450,9 +450,9 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<23>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r3, [test_ne_param_2];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_ne_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_ne_param_0];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_ne_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_ne_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_ne_param_0];
 ; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p1, %r5, %r4;
@@ -490,8 +490,8 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_mul_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_mul_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_mul_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_mul_param_0];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
@@ -531,8 +531,8 @@ define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_or_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_or_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_or_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_or_param_0];
 ; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -547,7 +547,7 @@ define <4 x i8> @test_or_computed(i8 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_or_computed_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_or_computed_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, 0;
 ; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x3340U;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
@@ -569,7 +569,7 @@ define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_or_imm_0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_or_imm_0_param_0];
 ; CHECK-NEXT:    or.b32 %r2, %r1, 67305985;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -583,7 +583,7 @@ define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_or_imm_1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_or_imm_1_param_0];
 ; CHECK-NEXT:    or.b32 %r2, %r1, 67305985;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -597,8 +597,8 @@ define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_xor_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_xor_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_xor_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_xor_param_0];
 ; CHECK-NEXT:    xor.b32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -613,7 +613,7 @@ define <4 x i8> @test_xor_computed(i8 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_xor_computed_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_xor_computed_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, 0;
 ; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x3340U;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
@@ -635,7 +635,7 @@ define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_xor_imm_0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_xor_imm_0_param_0];
 ; CHECK-NEXT:    xor.b32 %r2, %r1, 67305985;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -649,7 +649,7 @@ define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_xor_imm_1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_xor_imm_1_param_0];
 ; CHECK-NEXT:    xor.b32 %r2, %r1, 67305985;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -663,8 +663,8 @@ define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_and_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_and_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_and_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_and_param_0];
 ; CHECK-NEXT:    and.b32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -679,7 +679,7 @@ define <4 x i8> @test_and_computed(i8 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_and_computed_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_and_computed_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, 0;
 ; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x3340U;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
@@ -701,7 +701,7 @@ define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_and_imm_0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_and_imm_0_param_0];
 ; CHECK-NEXT:    and.b32 %r2, %r1, 67305985;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -715,7 +715,7 @@ define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_and_imm_1_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_and_imm_1_param_0];
 ; CHECK-NEXT:    and.b32 %r2, %r1, 67305985;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -730,10 +730,10 @@ define void @test_ldst_v2i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2i8_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    st.u32 [%rd2], %r1;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v2i8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v2i8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    st.b32 [%rd2], %r1;
 ; CHECK-NEXT:    ret;
   %t1 = load <4 x i8>, ptr %a
   store <4 x i8> %t1, ptr %b, align 16
@@ -747,12 +747,12 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v3i8_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    st.u16 [%rd2], %r1;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v3i8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v3i8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    st.b16 [%rd2], %r1;
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 16, 8;
-; CHECK-NEXT:    st.u8 [%rd2+2], %r2;
+; CHECK-NEXT:    st.b8 [%rd2+2], %r2;
 ; CHECK-NEXT:    ret;
   %t1 = load <3 x i8>, ptr %a
   store <3 x i8> %t1, ptr %b, align 16
@@ -766,10 +766,10 @@ define void @test_ldst_v4i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i8_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    st.u32 [%rd2], %r1;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v4i8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v4i8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    st.b32 [%rd2], %r1;
 ; CHECK-NEXT:    ret;
   %t1 = load <4 x i8>, ptr %a
   store <4 x i8> %t1, ptr %b, align 16
@@ -783,16 +783,16 @@ define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0];
-; CHECK-NEXT:    ld.u8 %r1, [%rd1];
-; CHECK-NEXT:    ld.u8 %r2, [%rd1+1];
-; CHECK-NEXT:    ld.u8 %r3, [%rd1+2];
-; CHECK-NEXT:    ld.u8 %r4, [%rd1+3];
-; CHECK-NEXT:    st.u8 [%rd2+3], %r4;
-; CHECK-NEXT:    st.u8 [%rd2+2], %r3;
-; CHECK-NEXT:    st.u8 [%rd2+1], %r2;
-; CHECK-NEXT:    st.u8 [%rd2], %r1;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0];
+; CHECK-NEXT:    ld.b8 %r1, [%rd1];
+; CHECK-NEXT:    ld.b8 %r2, [%rd1+1];
+; CHECK-NEXT:    ld.b8 %r3, [%rd1+2];
+; CHECK-NEXT:    ld.b8 %r4, [%rd1+3];
+; CHECK-NEXT:    st.b8 [%rd2+3], %r4;
+; CHECK-NEXT:    st.b8 [%rd2+2], %r3;
+; CHECK-NEXT:    st.b8 [%rd2+1], %r2;
+; CHECK-NEXT:    st.b8 [%rd2], %r1;
 ; CHECK-NEXT:    ret;
   %t1 = load <4 x i8>, ptr %a, align 1
   store <4 x i8> %t1, ptr %b, align 1
@@ -807,8 +807,8 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v8i8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v8i8_param_0];
 ; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    st.v2.b32 [%rd2], {%r1, %r2};
 ; CHECK-NEXT:    ret;
@@ -825,8 +825,8 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_call_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_call_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_call_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_call_param_0];
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.b32 [param0], %r1;
@@ -853,8 +853,8 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_call_flipped_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_call_flipped_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_call_flipped_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_call_flipped_param_0];
 ; CHECK-NEXT:    { // callseq 1, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.b32 [param0], %r2;
@@ -881,8 +881,8 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_tailcall_flipped_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tailcall_flipped_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_tailcall_flipped_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tailcall_flipped_param_0];
 ; CHECK-NEXT:    { // callseq 2, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.b32 [param0], %r2;
@@ -911,11 +911,11 @@ define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_select_param_2];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.u32 %r2, [test_select_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_select_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_param_0];
 ; CHECK-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -930,10 +930,10 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
 ; CHECK-NEXT:    .reg .b32 %r<28>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r4, [test_select_cc_param_3];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
+; CHECK-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r6, %r3, 0, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p1, %r6, %r5;
@@ -975,10 +975,10 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
 ; CHECK-NEXT:    .reg .b32 %r<23>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
-; CHECK-NEXT:    ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
-; CHECK-NEXT:    ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
+; CHECK-NEXT:    ld.param.b32 %r10, [test_select_cc_i32_i8_param_3];
+; CHECK-NEXT:    ld.param.b32 %r9, [test_select_cc_i32_i8_param_2];
 ; CHECK-NEXT:    bfe.u32 %r11, %r10, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r12, %r9, 0, 8;
 ; CHECK-NEXT:    setp.ne.u32 %p1, %r12, %r11;
@@ -1010,10 +1010,10 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
 ; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
-; CHECK-NEXT:    ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_select_cc_i8_i32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_select_cc_i8_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_i8_i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_i8_i32_param_0];
 ; CHECK-NEXT:    setp.ne.s32 %p1, %r3, %r7;
 ; CHECK-NEXT:    setp.ne.s32 %p2, %r4, %r8;
 ; CHECK-NEXT:    setp.ne.s32 %p3, %r5, %r9;
@@ -1048,7 +1048,7 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
 ; CHECK-NEXT:    prmt.b32 %r5, %r3, %r4, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r6, %r1, %r2, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r7, %r6, %r5, 0x5410U;
@@ -1065,8 +1065,8 @@ define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
 ; CHECK-NEXT:    cvt.u32.u64 %r1, %rd4;
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd3;
 ; CHECK-NEXT:    prmt.b32 %r3, %r2, %r1, 0x3340U;
@@ -1086,7 +1086,7 @@ define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_zext_2xi32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_zext_2xi32_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
@@ -1104,7 +1104,7 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_zext_2xi64_param_0];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u64.u32 %rd1, %r2;
 ; CHECK-NEXT:    and.b64 %rd2, %rd1, 255;
@@ -1130,7 +1130,7 @@ define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_i32_to_4xi8_param_0];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %r = bitcast i32 %a to <4 x i8>
@@ -1144,7 +1144,7 @@ define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [test_bitcast_float_to_4xi8_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [test_bitcast_float_to_4xi8_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, %f1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -1158,7 +1158,7 @@ define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_4xi8_to_i32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_4xi8_to_i32_param_0];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %r = bitcast <4 x i8> %a to i32
@@ -1172,9 +1172,9 @@ define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_4xi8_to_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitcast_4xi8_to_float_param_0];
 ; CHECK-NEXT:    mov.b32 %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
 ; CHECK-NEXT:    ret;
   %r = bitcast <4 x i8> %a to float
   ret float %r
@@ -1188,7 +1188,7 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, 6;
 ; CHECK-NEXT:    prmt.b32 %r2, %r1, 7, 0x3340U;
 ; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
@@ -1211,7 +1211,7 @@ define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_shufflevector_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_shufflevector_param_0];
 ; CHECK-NEXT:    // implicit-def: %r3
 ; CHECK-NEXT:    prmt.b32 %r2, %r1, %r3, 0x123U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
@@ -1226,8 +1226,8 @@ define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r2, [test_shufflevector_2_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_shufflevector_2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_shufflevector_2_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_shufflevector_2_param_0];
 ; CHECK-NEXT:    prmt.b32 %r3, %r1, %r2, 0x2537U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -1243,8 +1243,8 @@ define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [test_insertelement_param_1];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_insertelement_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [test_insertelement_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_insertelement_param_0];
 ; CHECK-NEXT:    cvt.u32.u16 %r2, %rs1;
 ; CHECK-NEXT:    bfi.b32 %r3, %r2, %r1, 8, 8;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -1260,7 +1260,7 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; CHECK-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
@@ -1291,7 +1291,7 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
 ; CHECK-NEXT:    cvt.rzi.u16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rzi.u16.f16 %rs4, %rs1;
@@ -1323,11 +1323,11 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd3, [test_srem_v4i8_param_2];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_srem_v4i8_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_srem_v4i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    ld.u32 %r2, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_srem_v4i8_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_srem_v4i8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_srem_v4i8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    ld.b32 %r2, [%rd2];
 ; CHECK-NEXT:    bfe.s32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs1, %r3;
 ; CHECK-NEXT:    bfe.s32 %r4, %r1, 24, 8;
@@ -1355,7 +1355,7 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
 ; CHECK-NEXT:    prmt.b32 %r16, %r15, %r12, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r17, %r16, %r9, 0x5410U;
-; CHECK-NEXT:    st.u32 [%rd3], %r17;
+; CHECK-NEXT:    st.b32 [%rd3], %r17;
 ; CHECK-NEXT:    ret;
 entry:
   %t57 = load <4 x i8>, ptr %a, align 4
@@ -1379,17 +1379,17 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd3, [test_srem_v3i8_param_2];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_srem_v3i8_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_srem_v3i8_param_0];
-; CHECK-NEXT:    ld.u8 %rs1, [%rd1];
-; CHECK-NEXT:    ld.u8 %rs2, [%rd1+1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_srem_v3i8_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_srem_v3i8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_srem_v3i8_param_0];
+; CHECK-NEXT:    ld.b8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.b8 %rs2, [%rd1+1];
 ; CHECK-NEXT:    shl.b16 %rs3, %rs2, 8;
 ; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs1;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
 ; CHECK-NEXT:    ld.s8 %rs5, [%rd1+2];
-; CHECK-NEXT:    ld.u8 %rs6, [%rd2];
-; CHECK-NEXT:    ld.u8 %rs7, [%rd2+1];
+; CHECK-NEXT:    ld.b8 %rs6, [%rd2];
+; CHECK-NEXT:    ld.b8 %rs7, [%rd2+1];
 ; CHECK-NEXT:    shl.b16 %rs8, %rs7, 8;
 ; CHECK-NEXT:    or.b16 %rs9, %rs8, %rs6;
 ; CHECK-NEXT:    cvt.u32.u16 %r2, %rs9;
@@ -1413,10 +1413,10 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    prmt.b32 %r13, %r9, %r10, 0x5410U;
 ; CHECK-NEXT:    rem.s16 %rs17, %rs5, %rs10;
 ; CHECK-NEXT:    mov.b32 {%rs18, _}, %r13;
-; CHECK-NEXT:    st.u8 [%rd3], %rs18;
+; CHECK-NEXT:    st.b8 [%rd3], %rs18;
 ; CHECK-NEXT:    shr.u16 %rs19, %rs18, 8;
-; CHECK-NEXT:    st.u8 [%rd3+1], %rs19;
-; CHECK-NEXT:    st.u8 [%rd3+2], %rs17;
+; CHECK-NEXT:    st.b8 [%rd3+1], %rs19;
+; CHECK-NEXT:    st.b8 [%rd3+2], %rs17;
 ; CHECK-NEXT:    ret;
 entry:
   %t57 = load <3 x i8>, ptr %a, align 1
@@ -1434,11 +1434,11 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd3, [test_sext_v4i1_to_v4i8_param_2];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_sext_v4i1_to_v4i8_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    ld.u32 %r2, [%rd2];
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
+; CHECK-NEXT:    ld.b32 %r2, [%rd2];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
 ; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
 ; CHECK-NEXT:    setp.hi.u32 %p1, %r4, %r3;
@@ -1458,7 +1458,7 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    selp.b32 %r15, -1, 0, %p1;
 ; CHECK-NEXT:    prmt.b32 %r16, %r15, %r14, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x5410U;
-; CHECK-NEXT:    st.u32 [%rd3], %r17;
+; CHECK-NEXT:    st.b32 [%rd3], %r17;
 ; CHECK-NEXT:    ret;
 entry:
   %t1 = load <4 x i8>, ptr %a, align 4
diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll
index efd61f905dab4..d41b9b9973351 100644
--- a/llvm/test/CodeGen/NVPTX/idioms.ll
+++ b/llvm/test/CodeGen/NVPTX/idioms.ll
@@ -40,7 +40,7 @@ define %struct.S16 @i32_to_2xi16(i32 noundef %in) {
   %low = trunc i32 %in to i16
   %high32 = lshr i32 %in, 16
   %high = trunc i32 %high32 to i16
-; CHECK:       ld.param.u32  %[[R32:r[0-9]+]], [i32_to_2xi16_param_0];
+; CHECK:       ld.param.b32  %[[R32:r[0-9]+]], [i32_to_2xi16_param_0];
 ; CHECK-DAG:   cvt.u16.u32   %rs{{[0-9+]}}, %[[R32]];
 ; CHECK-DAG    mov.b32       {tmp, %rs{{[0-9+]}}}, %[[R32]];
   %s1 = insertvalue %struct.S16 poison, i16 %low, 0
@@ -54,7 +54,7 @@ define %struct.S16 @i32_to_2xi16_lh(i32 noundef %in) {
   %high32 = lshr i32 %in, 16
   %high = trunc i32 %high32 to i16
   %low = trunc i32 %in to i16
-; CHECK:       ld.param.u32  %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0];
+; CHECK:       ld.param.b32  %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0];
 ; CHECK-DAG:   cvt.u16.u32   %rs{{[0-9+]}}, %[[R32]];
 ; CHECK-DAG    mov.b32       {tmp, %rs{{[0-9+]}}}, %[[R32]];
   %s1 = insertvalue %struct.S16 poison, i16 %low, 0
@@ -82,7 +82,7 @@ define %struct.S32 @i64_to_2xi32(i64 noundef %in) {
   %low = trunc i64 %in to i32
   %high64 = lshr i64 %in, 32
   %high = trunc i64 %high64 to i32
-; CHECK:       ld.param.u64  %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0];
+; CHECK:       ld.param.b64  %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0];
 ; CHECK-DAG:   cvt.u32.u64   %r{{[0-9+]}}, %[[R64]];
 ; CHECK-DAG    mov.b64       {tmp, %r{{[0-9+]}}}, %[[R64]];
   %s1 = insertvalue %struct.S32 poison, i32 %low, 0
@@ -112,7 +112,7 @@ define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){
   %l = trunc i32 %i1 to i16
   %h32 = ashr i32 %i1, 16
   %h = trunc i32 %h32 to i16
-; CHECK:      ld.param.u32    %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0];
+; CHECK:      ld.param.b32    %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0];
 ; CHECK:      shr.s32         %[[R32H:r[0-9]+]], %[[R32]], 16;
 ; CHECK-DAG    mov.b32       {tmp, %rs{{[0-9+]}}}, %[[R32]];
 ; CHECK-DAG    mov.b32       {tmp, %rs{{[0-9+]}}}, %[[R32H]];
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
index 4509fcfd1a9bc..1341a04c939c6 100644
--- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -22,9 +22,9 @@ define internal i32 @foo() {
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.b64 %SPL, __local_depot0;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.global.b64 %rd1, [ptr];
 ; CHECK-NEXT:    add.u64 %rd3, %SPL, 1;
-; CHECK-NEXT:    ld.local.u8 %rs1, [%rd3];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd3];
 ; CHECK-NEXT:    add.u64 %rd4, %SP, 0;
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 1 .b8 param0[1];
@@ -65,9 +65,9 @@ define internal i32 @bar() {
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    mov.b64 %SPL, __local_depot1;
 ; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
-; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.global.b64 %rd1, [ptr];
 ; CHECK-NEXT:    add.u64 %rd3, %SPL, 8;
-; CHECK-NEXT:    ld.local.u64 %rd4, [%rd3];
+; CHECK-NEXT:    ld.local.b64 %rd4, [%rd3];
 ; CHECK-NEXT:    add.u64 %rd5, %SP, 0;
 ; CHECK-NEXT:    { // callseq 1, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
index 67c074ca73156..5cfdbb7447ad8 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll
@@ -33,9 +33,9 @@ define void @test_b128_input_from_load(ptr nocapture readonly %data) {
 ; CHECK-NEXT:    .reg .b128 %rq<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_b128_input_from_load_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_b128_input_from_load_param_0];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
-; CHECK-NEXT:    ld.global.v2.u64 {%rd4, %rd5}, [%rd3];
+; CHECK-NEXT:    ld.global.v2.b64 {%rd4, %rd5}, [%rd3];
 ; CHECK-NEXT:    mov.b64 %rd6, value;
 ; CHECK-NEXT:    cvta.global.u64 %rd1, %rd6;
 ; CHECK-NEXT:    mov.b128 %rq1, {%rd4, %rd5};
@@ -59,9 +59,9 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
 ; CHECK-NEXT:    .reg .b128 %rq<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_b128_input_from_select_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_b128_input_from_select_param_0];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
-; CHECK-NEXT:    ld.global.u8 %rs1, [%rd3];
+; CHECK-NEXT:    ld.global.b8 %rs1, [%rd3];
 ; CHECK-NEXT:    setp.eq.s16 %p1, %rs1, 0;
 ; CHECK-NEXT:    selp.b64 %rd4, 24, 42, %p1;
 ; CHECK-NEXT:    mov.b64 %rd5, 0;
@@ -93,7 +93,7 @@ define void @test_store_b128_output() {
 ; CHECK-NEXT:    mov.b128 {%rd1, %rd2}, %rq1;
 ; CHECK-NEXT:    add.cc.s64 %rd3, %rd1, 1;
 ; CHECK-NEXT:    addc.cc.s64 %rd4, %rd2, 0;
-; CHECK-NEXT:    st.global.v2.u64 [value], {%rd3, %rd4};
+; CHECK-NEXT:    st.global.v2.b64 [value], {%rd3, %rd4};
 ; CHECK-NEXT:    ret;
   %1 = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"()
   %add = add nsw i128 %1, 1
@@ -109,9 +109,9 @@ define void @test_use_of_b128_output(ptr nocapture readonly %data) {
 ; CHECK-NEXT:    .reg .b128 %rq<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_use_of_b128_output_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_use_of_b128_output_param_0];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; CHECK-NEXT:    ld.global.v2.u64 {%rd3, %rd4}, [%rd2];
+; CHECK-NEXT:    ld.global.v2.b64 {%rd3, %rd4}, [%rd2];
 ; CHECK-NEXT:    mov.b128 %rq2, {%rd3, %rd4};
 ; CHECK-NEXT:    // begin inline asm
 ; CHECK-NEXT:    { mov.b128 %rq1, %rq2; }
@@ -119,7 +119,7 @@ define void @test_use_of_b128_output(ptr nocapture readonly %data) {
 ; CHECK-NEXT:    mov.b128 {%rd5, %rd6}, %rq1;
 ; CHECK-NEXT:    add.cc.s64 %rd7, %rd5, 1;
 ; CHECK-NEXT:    addc.cc.s64 %rd8, %rd6, 0;
-; CHECK-NEXT:    st.global.v2.u64 [value], {%rd7, %rd8};
+; CHECK-NEXT:    st.global.v2.b64 [value], {%rd7, %rd8};
 ; CHECK-NEXT:    ret;
   %1 = addrspacecast ptr %data to ptr addrspace(1)
   %2 = load <2 x i64>, ptr addrspace(1) %1, align 16
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
index 8ca863bba5f4a..52bd51b3ef7f9 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll
@@ -21,7 +21,7 @@ define void @test_corner_values() {
 ; CHECK-NEXT:    .reg .b128 %rq<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.global.u64 %rd1, [v64];
+; CHECK-NEXT:    ld.global.b64 %rd1, [v64];
 ; CHECK-NEXT:    add.s64 %rd2, %rd1, 8;
 ; CHECK-NEXT:    mov.b64 %rd13, -1;
 ; CHECK-NEXT:    mov.b128 %rq1, {%rd13, %rd13};
@@ -37,7 +37,7 @@ define void @test_corner_values() {
 ; CHECK-NEXT:    st.b128 [%rd3], %rq1;
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    // end inline asm
-; CHECK-NEXT:    ld.global.u64 %rd15, [v64];
+; CHECK-NEXT:    ld.global.b64 %rd15, [v64];
 ; CHECK-NEXT:    add.s64 %rd4, %rd15, 16;
 ; CHECK-NEXT:    add.s64 %rd5, %rd15, 24;
 ; CHECK-NEXT:    mov.b64 %rd16, 9223372036854775807;
@@ -54,7 +54,7 @@ define void @test_corner_values() {
 ; CHECK-NEXT:    st.b128 [%rd6], %rq2;
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    // end inline asm
-; CHECK-NEXT:    ld.global.u64 %rd18, [v64];
+; CHECK-NEXT:    ld.global.b64 %rd18, [v64];
 ; CHECK-NEXT:    add.s64 %rd7, %rd18, 32;
 ; CHECK-NEXT:    add.s64 %rd8, %rd18, 40;
 ; CHECK-NEXT:    mov.b64 %rd19, -9223372036854775808;
@@ -72,7 +72,7 @@ define void @test_corner_values() {
 ; CHECK-NEXT:    st.b128 [%rd9], %rq3;
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    // end inline asm
-; CHECK-NEXT:    ld.global.u64 %rd22, [v64];
+; CHECK-NEXT:    ld.global.b64 %rd22, [v64];
 ; CHECK-NEXT:    add.s64 %rd10, %rd22, 48;
 ; CHECK-NEXT:    add.s64 %rd11, %rd22, 56;
 ; CHECK-NEXT:    mov.b128 %rq4, {%rd20, %rd20};
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
index 4ea31dd52a321..6dbf44f38aa2f 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll
@@ -19,7 +19,7 @@ define void @test_b128_in_loop() {
 ; CHECK-NEXT:    setp.eq.s64 %p1, %rd1, 0;
 ; CHECK-NEXT:    @%p1 bra $L__BB0_3;
 ; CHECK-NEXT:  // %bb.1: // %BB1
-; CHECK-NEXT:    ld.global.v2.u64 {%rd12, %rd13}, [x];
+; CHECK-NEXT:    ld.global.v2.b64 {%rd12, %rd13}, [x];
 ; CHECK-NEXT:    mov.b64 %rd14, 0;
 ; CHECK-NEXT:  $L__BB0_2: // %BB2
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -34,7 +34,7 @@ define void @test_b128_in_loop() {
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    // end inline asm
 ; CHECK-NEXT:    mov.b128 {%rd12, %rd13}, %rq1;
-; CHECK-NEXT:    st.global.v2.u64 [x], {%rd12, %rd13};
+; CHECK-NEXT:    st.global.v2.b64 [x], {%rd12, %rd13};
 ; CHECK-NEXT:    add.s64 %rd14, %rd14, 1;
 ; CHECK-NEXT:    setp.ne.s64 %p2, %rd1, %rd14;
 ; CHECK-NEXT:    @%p2 bra $L__BB0_2;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 01c51bb72d055..a8beeb287c225 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -10,9 +10,9 @@ define float @test_fabsf(float %f) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [test_fabsf_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [test_fabsf_param_0];
 ; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.fabs.f32(float %f)
   ret float %x
@@ -24,9 +24,9 @@ define double @test_fabs(double %d) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [test_fabs_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [test_fabs_param_0];
 ; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.fabs.f64(double %d)
   ret double %x
@@ -38,9 +38,9 @@ define float @test_nvvm_sqrt(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [test_nvvm_sqrt_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [test_nvvm_sqrt_param_0];
 ; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.sqrt.f(float %a)
   ret float %val
@@ -52,9 +52,9 @@ define float @test_llvm_sqrt(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [test_llvm_sqrt_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [test_llvm_sqrt_param_0];
 ; CHECK-NEXT:    sqrt.rn.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.sqrt.f32(float %a)
   ret float %val
@@ -66,7 +66,7 @@ define i32 @test_bitreverse32(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_bitreverse32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_bitreverse32_param_0];
 ; CHECK-NEXT:    brev.b32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -80,7 +80,7 @@ define i64 @test_bitreverse64(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_bitreverse64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_bitreverse64_param_0];
 ; CHECK-NEXT:    brev.b64 %rd2, %rd1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
@@ -94,7 +94,7 @@ define i32 @test_popc32(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_popc32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_popc32_param_0];
 ; CHECK-NEXT:    popc.b32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -109,7 +109,7 @@ define i64 @test_popc64(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_popc64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_popc64_param_0];
 ; CHECK-NEXT:    popc.b64 %r1, %rd1;
 ; CHECK-NEXT:    cvt.u64.u32 %rd2, %r1;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
@@ -128,7 +128,7 @@ define i32 @test_popc64_trunc(i64 %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_popc64_trunc_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_popc64_trunc_param_0];
 ; CHECK-NEXT:    popc.b64 %r1, %rd1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -145,10 +145,10 @@ define void @test_popc16(i16 %a, ptr %b) {
 ; CHECK32-NEXT:    .reg .b32 %r<4>;
 ; CHECK32-EMPTY:
 ; CHECK32-NEXT:  // %bb.0:
-; CHECK32-NEXT:    ld.param.u16 %r1, [test_popc16_param_0];
+; CHECK32-NEXT:    ld.param.b16 %r1, [test_popc16_param_0];
 ; CHECK32-NEXT:    popc.b32 %r2, %r1;
-; CHECK32-NEXT:    ld.param.u32 %r3, [test_popc16_param_1];
-; CHECK32-NEXT:    st.u16 [%r3], %r2;
+; CHECK32-NEXT:    ld.param.b32 %r3, [test_popc16_param_1];
+; CHECK32-NEXT:    st.b16 [%r3], %r2;
 ; CHECK32-NEXT:    ret;
 ;
 ; CHECK64-LABEL: test_popc16(
@@ -157,10 +157,10 @@ define void @test_popc16(i16 %a, ptr %b) {
 ; CHECK64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK64-EMPTY:
 ; CHECK64-NEXT:  // %bb.0:
-; CHECK64-NEXT:    ld.param.u16 %r1, [test_popc16_param_0];
+; CHECK64-NEXT:    ld.param.b16 %r1, [test_popc16_param_0];
 ; CHECK64-NEXT:    popc.b32 %r2, %r1;
-; CHECK64-NEXT:    ld.param.u64 %rd1, [test_popc16_param_1];
-; CHECK64-NEXT:    st.u16 [%rd1], %r2;
+; CHECK64-NEXT:    ld.param.b64 %rd1, [test_popc16_param_1];
+; CHECK64-NEXT:    st.b16 [%rd1], %r2;
 ; CHECK64-NEXT:    ret;
   %val = call i16 @llvm.ctpop.i16(i16 %a)
   store i16 %val, ptr %b
@@ -175,7 +175,7 @@ define i32 @test_popc16_to_32(i16 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %r1, [test_popc16_to_32_param_0];
+; CHECK-NEXT:    ld.param.b16 %r1, [test_popc16_to_32_param_0];
 ; CHECK-NEXT:    popc.b32 %r2, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll
index dbd4f8a55facf..0718e6d603b6c 100644
--- a/llvm/test/CodeGen/NVPTX/jump-table.ll
+++ b/llvm/test/CodeGen/NVPTX/jump-table.ll
@@ -13,7 +13,7 @@ define void @foo(i32 %i) {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u32 %r2, [foo_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [foo_param_0];
 ; CHECK-NEXT:    setp.gt.u32 %p1, %r2, 3;
 ; CHECK-NEXT:    @%p1 bra $L__BB0_6;
 ; CHECK-NEXT:  // %bb.1: // %entry
@@ -25,19 +25,19 @@ define void @foo(i32 %i) {
 ; CHECK-NEXT:    brx.idx %r2, $L_brx_0;
 ; CHECK-NEXT:  $L__BB0_2: // %case0
 ; CHECK-NEXT:    mov.b32 %r6, 0;
-; CHECK-NEXT:    st.global.u32 [out], %r6;
+; CHECK-NEXT:    st.global.b32 [out], %r6;
 ; CHECK-NEXT:    bra.uni $L__BB0_6;
 ; CHECK-NEXT:  $L__BB0_4: // %case2
 ; CHECK-NEXT:    mov.b32 %r4, 2;
-; CHECK-NEXT:    st.global.u32 [out], %r4;
+; CHECK-NEXT:    st.global.b32 [out], %r4;
 ; CHECK-NEXT:    bra.uni $L__BB0_6;
 ; CHECK-NEXT:  $L__BB0_5: // %case3
 ; CHECK-NEXT:    mov.b32 %r3, 3;
-; CHECK-NEXT:    st.global.u32 [out], %r3;
+; CHECK-NEXT:    st.global.b32 [out], %r3;
 ; CHECK-NEXT:    bra.uni $L__BB0_6;
 ; CHECK-NEXT:  $L__BB0_3: // %case1
 ; CHECK-NEXT:    mov.b32 %r5, 1;
-; CHECK-NEXT:    st.global.u32 [out], %r5;
+; CHECK-NEXT:    st.global.b32 [out], %r5;
 ; CHECK-NEXT:  $L__BB0_6: // %end
 ; CHECK-NEXT:    ret;
 entry:
@@ -76,7 +76,7 @@ define i32 @test2(i32 %tmp158) {
 ; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u32 %r1, [test2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test2_param_0];
 ; CHECK-NEXT:    setp.gt.s32 %p1, %r1, 119;
 ; CHECK-NEXT:    @%p1 bra $L__BB1_4;
 ; CHECK-NEXT:  // %bb.1: // %entry
diff --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
index ec20fda67ec0e..24071b48143f2 100644
--- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -9,24 +9,24 @@
 ;; i8
 define i8 @ld_global_i8(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: ld_global_i8
-; G32: ld.global.u8 %{{.*}}, [%r{{[0-9]+}}]
-; G64: ld.global.u8 %{{.*}}, [%rd{{[0-9]+}}]
+; G32: ld.global.b8 %{{.*}}, [%r{{[0-9]+}}]
+; G64: ld.global.b8 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i8, ptr addrspace(1) %ptr
   ret i8 %a
 }
 define i8 @ld_shared_i8(ptr addrspace(3) %ptr) {
 ; ALL-LABEL: ld_shared_i8
-; LS32: ld.shared.u8 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.shared.u8 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.shared.b8 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.shared.b8 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i8, ptr addrspace(3) %ptr
   ret i8 %a
 }
 define i8 @ld_local_i8(ptr addrspace(5) %ptr) {
 ; ALL-LABEL: ld_local_i8
-; LS32: ld.local.u8 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.local.u8 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.local.b8 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.local.b8 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i8, ptr addrspace(5) %ptr
   ret i8 %a
@@ -35,24 +35,24 @@ define i8 @ld_local_i8(ptr addrspace(5) %ptr) {
 ;; i16
 define i16 @ld_global_i16(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: ld_global_i16
-; G32: ld.global.u16 %{{.*}}, [%r{{[0-9]+}}]
-; G64: ld.global.u16 %{{.*}}, [%rd{{[0-9]+}}]
+; G32: ld.global.b16 %{{.*}}, [%r{{[0-9]+}}]
+; G64: ld.global.b16 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i16, ptr addrspace(1) %ptr
   ret i16 %a
 }
 define i16 @ld_shared_i16(ptr addrspace(3) %ptr) {
 ; ALL-LABEL: ld_shared_i16
-; LS32: ld.shared.u16 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.shared.u16 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.shared.b16 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.shared.b16 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i16, ptr addrspace(3) %ptr
   ret i16 %a
 }
 define i16 @ld_local_i16(ptr addrspace(5) %ptr) {
 ; ALL-LABEL: ld_local_i16
-; LS32: ld.local.u16 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.local.u16 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.local.b16 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.local.b16 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i16, ptr addrspace(5) %ptr
   ret i16 %a
@@ -61,24 +61,24 @@ define i16 @ld_local_i16(ptr addrspace(5) %ptr) {
 ;; i32
 define i32 @ld_global_i32(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: ld_global_i32
-; G32: ld.global.u32 %{{.*}}, [%r{{[0-9]+}}]
-; G64: ld.global.u32 %{{.*}}, [%rd{{[0-9]+}}]
+; G32: ld.global.b32 %{{.*}}, [%r{{[0-9]+}}]
+; G64: ld.global.b32 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i32, ptr addrspace(1) %ptr
   ret i32 %a
 }
 define i32 @ld_shared_i32(ptr addrspace(3) %ptr) {
 ; ALL-LABEL: ld_shared_i32
-; LS32: ld.shared.u32 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.shared.u32 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.shared.b32 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.shared.b32 %{{.*}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32, ptr addrspace(3) %ptr
   ret i32 %a
 }
 define i32 @ld_local_i32(ptr addrspace(5) %ptr) {
 ; ALL-LABEL: ld_local_i32
-; LS32: ld.local.u32 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.local.u32 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.local.b32 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.local.b32 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i32, ptr addrspace(5) %ptr
   ret i32 %a
@@ -87,24 +87,24 @@ define i32 @ld_local_i32(ptr addrspace(5) %ptr) {
 ;; i64
 define i64 @ld_global_i64(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: ld_global_i64
-; G32: ld.global.u64 %{{.*}}, [%r{{[0-9]+}}]
-; G64: ld.global.u64 %{{.*}}, [%rd{{[0-9]+}}]
+; G32: ld.global.b64 %{{.*}}, [%r{{[0-9]+}}]
+; G64: ld.global.b64 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i64, ptr addrspace(1) %ptr
   ret i64 %a
 }
 define i64 @ld_shared_i64(ptr addrspace(3) %ptr) {
 ; ALL-LABEL: ld_shared_i64
-; LS32: ld.shared.u64 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.shared.u64 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.shared.b64 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.shared.b64 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i64, ptr addrspace(3) %ptr
   ret i64 %a
 }
 define i64 @ld_local_i64(ptr addrspace(5) %ptr) {
 ; ALL-LABEL: ld_local_i64
-; LS32: ld.local.u64 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.local.u64 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.local.b64 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.local.b64 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load i64, ptr addrspace(5) %ptr
   ret i64 %a
@@ -113,24 +113,24 @@ define i64 @ld_local_i64(ptr addrspace(5) %ptr) {
 ;; f32
 define float @ld_global_f32(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: ld_global_f32
-; G32: ld.global.f32 %{{.*}}, [%r{{[0-9]+}}]
-; G64: ld.global.f32 %{{.*}}, [%rd{{[0-9]+}}]
+; G32: ld.global.b32 %{{.*}}, [%r{{[0-9]+}}]
+; G64: ld.global.b32 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load float, ptr addrspace(1) %ptr
   ret float %a
 }
 define float @ld_shared_f32(ptr addrspace(3) %ptr) {
 ; ALL-LABEL: ld_shared_f32
-; LS32: ld.shared.f32 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.shared.f32 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.shared.b32 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.shared.b32 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load float, ptr addrspace(3) %ptr
   ret float %a
 }
 define float @ld_local_f32(ptr addrspace(5) %ptr) {
 ; ALL-LABEL: ld_local_f32
-; LS32: ld.local.f32 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.local.f32 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.local.b32 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.local.b32 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load float, ptr addrspace(5) %ptr
   ret float %a
@@ -139,24 +139,24 @@ define float @ld_local_f32(ptr addrspace(5) %ptr) {
 ;; f64
 define double @ld_global_f64(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: ld_global_f64
-; G32: ld.global.f64 %{{.*}}, [%r{{[0-9]+}}]
-; G64: ld.global.f64 %{{.*}}, [%rd{{[0-9]+}}]
+; G32: ld.global.b64 %{{.*}}, [%r{{[0-9]+}}]
+; G64: ld.global.b64 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load double, ptr addrspace(1) %ptr
   ret double %a
 }
 define double @ld_shared_f64(ptr addrspace(3) %ptr) {
 ; ALL-LABEL: ld_shared_f64
-; LS32: ld.shared.f64 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.shared.f64 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.shared.b64 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.shared.b64 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load double, ptr addrspace(3) %ptr
   ret double %a
 }
 define double @ld_local_f64(ptr addrspace(5) %ptr) {
 ; ALL-LABEL: ld_local_f64
-; LS32: ld.local.f64 %{{.*}}, [%r{{[0-9]+}}]
-; LS64: ld.local.f64 %{{.*}}, [%rd{{[0-9]+}}]
+; LS32: ld.local.b64 %{{.*}}, [%r{{[0-9]+}}]
+; LS64: ld.local.b64 %{{.*}}, [%rd{{[0-9]+}}]
 ; ALL: ret
   %a = load double, ptr addrspace(5) %ptr
   ret double %a
diff --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll
index cfc4491ded1e4..ce922dd8a5ac9 100644
--- a/llvm/test/CodeGen/NVPTX/ld-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll
@@ -6,9 +6,9 @@
 
 ;; i8
 define i8 @ld_global_i8(ptr addrspace(0) %ptr) {
-; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.b8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.b8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8, ptr addrspace(0) %ptr
   ret i8 %a
@@ -16,9 +16,9 @@ define i8 @ld_global_i8(ptr addrspace(0) %ptr) {
 
 ;; i16
 define i16 @ld_global_i16(ptr addrspace(0) %ptr) {
-; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.b16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.b16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16, ptr addrspace(0) %ptr
   ret i16 %a
@@ -26,9 +26,9 @@ define i16 @ld_global_i16(ptr addrspace(0) %ptr) {
 
 ;; i32
 define i32 @ld_global_i32(ptr addrspace(0) %ptr) {
-; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.b32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32, ptr addrspace(0) %ptr
   ret i32 %a
@@ -36,9 +36,9 @@ define i32 @ld_global_i32(ptr addrspace(0) %ptr) {
 
 ;; i64
 define i64 @ld_global_i64(ptr addrspace(0) %ptr) {
-; PTX32: ld.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64, ptr addrspace(0) %ptr
   ret i64 %a
@@ -46,9 +46,9 @@ define i64 @ld_global_i64(ptr addrspace(0) %ptr) {
 
 ;; f32
 define float @ld_global_f32(ptr addrspace(0) %ptr) {
-; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.b32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float, ptr addrspace(0) %ptr
   ret float %a
@@ -56,9 +56,9 @@ define float @ld_global_f32(ptr addrspace(0) %ptr) {
 
 ;; f64
 define double @ld_global_f64(ptr addrspace(0) %ptr) {
-; PTX32: ld.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.b64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double, ptr addrspace(0) %ptr
   ret double %a
diff --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
index 3f2ec2606e5e9..3f0c6b0291251 100644
--- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
+++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py
@@ -12,6 +12,18 @@
 from itertools import product
 from string import Template
 
+
+llvm_type_to_ptx_load_type = {
+    "i8": "b8",
+    "i16": "b16",
+    "i32": "b32",
+    "i64": "b64",
+    "half": "b16",
+    "<2 x half>": "b32",
+    "float": "b32",
+    "double": "b64",
+}
+
 llvm_type_to_ptx_type = {
     "i8": "u8",
     "i16": "u16",
@@ -48,8 +60,8 @@ def gen_load_tests():
     load_template = """
 define ${type} @${testname}(${type} addrspace(${asid})* %ptr) {
 ; CHECK: ${testname}
-; CHECK_P32: ld${_volatile}${_volatile_as}.${ptx_type} %${ptx_reg}{{[0-9]+}}, [%r{{[0-9]+}}]
-; CHECK_P64: ld${_volatile}${_volatile_as}.${ptx_type} %${ptx_reg}{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK_P32: ld${_volatile}${_volatile_as}.${ptx_load_type} %${ptx_reg}{{[0-9]+}}, [%r{{[0-9]+}}]
+; CHECK_P64: ld${_volatile}${_volatile_as}.${ptx_load_type} %${ptx_reg}{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; CHECK: ret
   %p = ${generic_ptr}
   %a = load ${volatile} ${type}, ${type}* %p
@@ -81,6 +93,7 @@ def gen_load_tests():
             "_space": space,
             "ptx_reg": llvm_type_to_ptx_reg[op_type],
             "ptx_type": llvm_type_to_ptx_type[op_type],
+            "ptx_load_type": llvm_type_to_ptx_load_type[op_type],
             "asid": addrspace_id[space],
         }
 
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index 2fe2d28320f06..3a342e4d838c6 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -12,8 +12,8 @@ define i32 @ld_global(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_param_0];
-; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_param_0];
+; CHECK-NEXT:    ld.global.nc.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a = load i32, ptr addrspace(1) %ptr, !invariant.load !0
@@ -31,8 +31,8 @@ define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2f16_param_0];
-; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v2f16_param_0];
+; CHECK-NEXT:    ld.global.nc.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
 ; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
@@ -58,8 +58,8 @@ define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4f16_param_0];
-; CHECK-NEXT:    ld.global.nc.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v4f16_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
 ; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
@@ -96,8 +96,8 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8f16_param_0];
-; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v8f16_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
 ; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
 ; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
@@ -135,8 +135,8 @@ define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8i8_param_0];
-; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v8i8_param_0];
+; CHECK-NEXT:    ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
 ; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
@@ -171,8 +171,8 @@ define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v16i8_param_0];
-; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v16i8_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
 ; CHECK-NEXT:    bfe.u32 %r6, %r4, 0, 8;
@@ -226,8 +226,8 @@ define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2i32_param_0];
-; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v2i32_param_0];
+; CHECK-NEXT:    ld.global.nc.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -245,8 +245,8 @@ define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4i32_param_0];
-; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_global_v4i32_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r1, %r2;
 ; CHECK-NEXT:    add.s32 %r6, %r3, %r4;
 ; CHECK-NEXT:    add.s32 %r7, %r5, %r6;
@@ -270,8 +270,8 @@ define i32 @ld_not_invariant(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_invariant_param_0];
-; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_not_invariant_param_0];
+; CHECK-NEXT:    ld.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a = load i32, ptr addrspace(1) %ptr
@@ -285,8 +285,8 @@ define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_global_addrspace_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [ld_not_global_addrspace_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a = load i32, ptr addrspace(0) %ptr
diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
index 12910f59f2586..0a528f0e8da06 100644
--- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -5,9 +5,9 @@ declare <4 x float> @bar()
 
 ; CHECK-LABEL: .func foo(
 define void @foo(ptr %ptr) {
-; CHECK:     ld.param.u64 %[[PTR:rd[0-9]+]], [foo_param_0];
-; CHECK:     ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0];
-; CHECK:     st.v4.f32    [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
+; CHECK:     ld.param.b64 %[[PTR:rd[0-9]+]], [foo_param_0];
+; CHECK:     ld.param.v4.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0];
+; CHECK:     st.v4.b32    [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
   %val = tail call <4 x float> @bar()
   store <4 x float> %val, ptr %ptr
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/ldu-i8.ll b/llvm/test/CodeGen/NVPTX/ldu-i8.ll
index 93f3326b70bf1..89f23f30f34e8 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-i8.ll
@@ -7,7 +7,7 @@ declare i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr, i32)
 
 define i8 @foo(ptr %a) {
 ; Ensure we properly truncate off the high-order 24 bits
-; CHECK:        ldu.global.u8
+; CHECK:        ldu.global.b8
 ; CHECK:        cvt.u32.u16
 ; CHECK:        and.b32         %r{{[0-9]+}}, %r{{[0-9]+}}, 255
   %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr %a, i32 4)
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index 2c1550aa082f0..be2e896f57009 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -31,8 +31,8 @@ define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_i8_param_0];
-; CHECK-NEXT:    ldu.global.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_i8_param_0];
+; CHECK-NEXT:    ldu.global.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 255;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
@@ -49,8 +49,8 @@ define i16 @test_ldu_i16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_i16_param_0];
-; CHECK-NEXT:    ldu.global.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_i16_param_0];
+; CHECK-NEXT:    ldu.global.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -65,8 +65,8 @@ define i32 @test_ldu_i32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_i32_param_0];
-; CHECK-NEXT:    ldu.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_i32_param_0];
+; CHECK-NEXT:    ldu.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
@@ -79,8 +79,8 @@ define i64 @test_ldu_i64(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_i64_param_0];
-; CHECK-NEXT:    ldu.global.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_i64_param_0];
+; CHECK-NEXT:    ldu.global.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
@@ -93,8 +93,8 @@ define ptr @test_ldu_p(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_p_param_0];
-; CHECK-NEXT:    ldu.global.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_p_param_0];
+; CHECK-NEXT:    ldu.global.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call ptr @llvm.nvvm.ldu.global.p.p1(ptr addrspace(1) %ptr, i32 8)
@@ -108,9 +108,9 @@ define float @test_ldu_f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_f32_param_0];
-; CHECK-NEXT:    ldu.global.f32 %f1, [%rd1];
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_f32_param_0];
+; CHECK-NEXT:    ldu.global.b32 %f1, [%rd1];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
@@ -123,9 +123,9 @@ define double @test_ldu_f64(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_f64_param_0];
-; CHECK-NEXT:    ldu.global.f64 %fd1, [%rd1];
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_f64_param_0];
+; CHECK-NEXT:    ldu.global.b64 %fd1, [%rd1];
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
   ret double %val
@@ -138,8 +138,8 @@ define half @test_ldu_f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_f16_param_0];
-; CHECK-NEXT:    ldu.global.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_f16_param_0];
+; CHECK-NEXT:    ldu.global.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
@@ -153,8 +153,8 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldu_v2f16_param_0];
-; CHECK-NEXT:    ldu.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldu_v2f16_param_0];
+; CHECK-NEXT:    ldu.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
@@ -169,8 +169,8 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_i8_param_0];
-; CHECK-NEXT:    ld.global.nc.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i8_param_0];
+; CHECK-NEXT:    ld.global.nc.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    cvt.u32.u8 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -186,8 +186,8 @@ define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_i16_param_0];
-; CHECK-NEXT:    ld.global.nc.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i16_param_0];
+; CHECK-NEXT:    ld.global.nc.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -202,8 +202,8 @@ define i32 @test_ldg_i32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_i32_param_0];
-; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i32_param_0];
+; CHECK-NEXT:    ld.global.nc.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4)
@@ -216,8 +216,8 @@ define i64 @test_ldg_i64(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_i64_param_0];
-; CHECK-NEXT:    ld.global.nc.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i64_param_0];
+; CHECK-NEXT:    ld.global.nc.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8)
@@ -230,8 +230,8 @@ define ptr @test_ldg_p(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_p_param_0];
-; CHECK-NEXT:    ld.global.nc.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_p_param_0];
+; CHECK-NEXT:    ld.global.nc.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %ptr, i32 8)
@@ -245,9 +245,9 @@ define float @test_ldg_f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_f32_param_0];
-; CHECK-NEXT:    ld.global.nc.f32 %f1, [%rd1];
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_f32_param_0];
+; CHECK-NEXT:    ld.global.nc.b32 %f1, [%rd1];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f1;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4)
   ret float %val
@@ -260,9 +260,9 @@ define double @test_ldg_f64(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_f64_param_0];
-; CHECK-NEXT:    ld.global.nc.f64 %fd1, [%rd1];
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_f64_param_0];
+; CHECK-NEXT:    ld.global.nc.b64 %fd1, [%rd1];
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd1;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8)
   ret double %val
@@ -275,8 +275,8 @@ define half @test_ldg_f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_f16_param_0];
-; CHECK-NEXT:    ld.global.nc.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_f16_param_0];
+; CHECK-NEXT:    ld.global.nc.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2)
@@ -290,8 +290,8 @@ define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldg_v2f16_param_0];
-; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_v2f16_param_0];
+; CHECK-NEXT:    ld.global.nc.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
@@ -306,7 +306,7 @@ define i32 @test_ldg_asi() {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.global.nc.u32 %r1, [g+4];
+; CHECK-NEXT:    ld.global.nc.b32 %r1, [g+4];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g, i32 4), i32 4)
@@ -319,7 +319,7 @@ define i32 @test_lug_asi() {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ldu.global.u32 %r1, [g+4];
+; CHECK-NEXT:    ldu.global.b32 %r1, [g+4];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g, i32 4), i32 4)
diff --git a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index 8b740117c55e1..f1f6be9750fb3 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 
 define void @reg_plus_offset(ptr %a) {
-; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%rd{{[0-9]+}}+32];
-; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%rd{{[0-9]+}}+36];
+; CHECK:        ldu.global.b32  %r{{[0-9]+}}, [%rd{{[0-9]+}}+32];
+; CHECK:        ldu.global.b32  %r{{[0-9]+}}, [%rd{{[0-9]+}}+36];
   %p2 = getelementptr i32, ptr %a, i32 8
   %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr %p2, i32 4)
   %p3 = getelementptr i32, ptr %a, i32 9
diff --git a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
index fd1492414bf8c..5952097f4cd22 100644
--- a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
+++ b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "nvptx-nvidia-cuda"
 
 define void @main(ptr %a1, i32 %a2, ptr %arg3) {
-; CHECK: ld.u8
-; CHECK-NOT: ld.u1
+; CHECK: ld.b8
+; CHECK-NOT: ld.b1
   %t1 = getelementptr i1, ptr %a1, i32 %a2
   %t2 = load i1, ptr %t1
   %t3 = sext i1 %t2 to i32
diff --git a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
index cb2e247bd78c1..468e19492bfd5 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
@@ -24,10 +24,10 @@ define void @generic_i8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_i8_param_0];
-; CHECK-NEXT:    ld.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_i8_param_0];
+; CHECK-NEXT:    ld.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i8, ptr %a
   %a.add = add i8 %a.load, 1
@@ -42,10 +42,10 @@ define void @generic_i16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_i16_param_0];
-; CHECK-NEXT:    ld.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_i16_param_0];
+; CHECK-NEXT:    ld.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i16, ptr %a
   %a.add = add i16 %a.load, 1
@@ -60,10 +60,10 @@ define void @generic_i32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_i32_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_i32_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load i32, ptr %a
   %a.add = add i32 %a.load, 1
@@ -77,10 +77,10 @@ define void @generic_i64(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_i64_param_0];
-; CHECK-NEXT:    ld.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_i64_param_0];
+; CHECK-NEXT:    ld.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load i64, ptr %a
   %a.add = add i64 %a.load, 1
@@ -95,10 +95,10 @@ define void @generic_float(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_float_param_0];
-; CHECK-NEXT:    ld.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_float_param_0];
+; CHECK-NEXT:    ld.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr %a
   %a.add = fadd float %a.load, 1.
@@ -113,10 +113,10 @@ define void @generic_double(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_double_param_0];
-; CHECK-NEXT:    ld.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_double_param_0];
+; CHECK-NEXT:    ld.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr %a
   %a.add = fadd double %a.load, 1.
@@ -133,10 +133,10 @@ define void @generic_volatile_i8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_i8_param_0];
-; CHECK-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_i8_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr %a
   %a.add = add i8 %a.load, 1
@@ -151,10 +151,10 @@ define void @generic_volatile_i16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_i16_param_0];
-; CHECK-NEXT:    ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_i16_param_0];
+; CHECK-NEXT:    ld.volatile.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i16, ptr %a
   %a.add = add i16 %a.load, 1
@@ -169,10 +169,10 @@ define void @generic_volatile_i32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_i32_param_0];
-; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_i32_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i32, ptr %a
   %a.add = add i32 %a.load, 1
@@ -186,10 +186,10 @@ define void @generic_volatile_i64(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_i64_param_0];
-; CHECK-NEXT:    ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_i64_param_0];
+; CHECK-NEXT:    ld.volatile.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i64, ptr %a
   %a.add = add i64 %a.load, 1
@@ -204,10 +204,10 @@ define void @generic_volatile_float(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_float_param_0];
-; CHECK-NEXT:    ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_float_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr %a
   %a.add = fadd float %a.load, 1.
@@ -222,10 +222,10 @@ define void @generic_volatile_double(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_double_param_0];
-; CHECK-NEXT:    ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_double_param_0];
+; CHECK-NEXT:    ld.volatile.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr %a
   %a.add = fadd double %a.load, 1.
@@ -242,10 +242,10 @@ define void @generic_unordered_sys_i8(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_i8(
@@ -254,10 +254,10 @@ define void @generic_unordered_sys_i8(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i8_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -272,10 +272,10 @@ define void @generic_unordered_sys_i16(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_i16(
@@ -284,10 +284,10 @@ define void @generic_unordered_sys_i16(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i16_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i16, ptr %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -302,10 +302,10 @@ define void @generic_unordered_sys_i32(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_i32(
@@ -314,10 +314,10 @@ define void @generic_unordered_sys_i32(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i32_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.relaxed.sys.u32 [%rd1], %r2;
+; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i32, ptr %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -331,10 +331,10 @@ define void @generic_unordered_sys_i64(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_i64(
@@ -342,10 +342,10 @@ define void @generic_unordered_sys_i64(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_i64_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.relaxed.sys.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i64, ptr %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -360,10 +360,10 @@ define void @generic_unordered_sys_float(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_float(
@@ -372,10 +372,10 @@ define void @generic_unordered_sys_float(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_float_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.f32 [%rd1], %f2;
+; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -390,10 +390,10 @@ define void @generic_unordered_sys_double(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_unordered_sys_double(
@@ -402,10 +402,10 @@ define void @generic_unordered_sys_double(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_unordered_sys_double_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -422,10 +422,10 @@ define void @generic_unordered_volatile_sys_i8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_unordered_volatile_sys_i8_param_0];
-; CHECK-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -440,10 +440,10 @@ define void @generic_unordered_volatile_sys_i16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_unordered_volatile_sys_i16_param_0];
-; CHECK-NEXT:    ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT:    ld.volatile.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -458,10 +458,10 @@ define void @generic_unordered_volatile_sys_i32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_unordered_volatile_sys_i32_param_0];
-; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -475,10 +475,10 @@ define void @generic_unordered_volatile_sys_i64(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_unordered_volatile_sys_i64_param_0];
-; CHECK-NEXT:    ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT:    ld.volatile.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -493,10 +493,10 @@ define void @generic_unordered_volatile_sys_float(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -511,10 +511,10 @@ define void @generic_unordered_volatile_sys_double(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT:    ld.volatile.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -531,10 +531,10 @@ define void @generic_monotonic_sys_i8(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_i8(
@@ -543,10 +543,10 @@ define void @generic_monotonic_sys_i8(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i8_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -561,10 +561,10 @@ define void @generic_monotonic_sys_i16(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_i16(
@@ -573,10 +573,10 @@ define void @generic_monotonic_sys_i16(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i16_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i16, ptr %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -591,10 +591,10 @@ define void @generic_monotonic_sys_i32(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_i32(
@@ -603,10 +603,10 @@ define void @generic_monotonic_sys_i32(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i32_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.relaxed.sys.u32 [%rd1], %r2;
+; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i32, ptr %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -620,10 +620,10 @@ define void @generic_monotonic_sys_i64(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_i64(
@@ -631,10 +631,10 @@ define void @generic_monotonic_sys_i64(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0];
-; SM70-NEXT:    ld.relaxed.sys.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_i64_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.relaxed.sys.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i64, ptr %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -649,10 +649,10 @@ define void @generic_monotonic_sys_float(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_float(
@@ -661,10 +661,10 @@ define void @generic_monotonic_sys_float(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_float_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.f32 [%rd1], %f2;
+; SM70-NEXT:    st.relaxed.sys.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -679,10 +679,10 @@ define void @generic_monotonic_sys_double(ptr %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: generic_monotonic_sys_double(
@@ -691,10 +691,10 @@ define void @generic_monotonic_sys_double(ptr %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [generic_monotonic_sys_double_param_0];
+; SM70-NEXT:    ld.relaxed.sys.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.relaxed.sys.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -711,10 +711,10 @@ define void @generic_monotonic_volatile_sys_i8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i8_param_0];
-; CHECK-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT:    ld.volatile.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -729,10 +729,10 @@ define void @generic_monotonic_volatile_sys_i16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i16_param_0];
-; CHECK-NEXT:    ld.volatile.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT:    ld.volatile.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -747,10 +747,10 @@ define void @generic_monotonic_volatile_sys_i32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i32_param_0];
-; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -764,10 +764,10 @@ define void @generic_monotonic_volatile_sys_i64(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i64_param_0];
-; CHECK-NEXT:    ld.volatile.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT:    ld.volatile.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -782,10 +782,10 @@ define void @generic_monotonic_volatile_sys_float(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -800,10 +800,10 @@ define void @generic_monotonic_volatile_sys_double(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT:    ld.volatile.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -822,10 +822,10 @@ define void @global_i8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_i8_param_0];
-; CHECK-NEXT:    ld.global.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_i8_param_0];
+; CHECK-NEXT:    ld.global.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.global.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.global.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i8, ptr addrspace(1) %a
   %a.add = add i8 %a.load, 1
@@ -840,10 +840,10 @@ define void @global_i16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_i16_param_0];
-; CHECK-NEXT:    ld.global.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_i16_param_0];
+; CHECK-NEXT:    ld.global.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.global.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.global.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i16, ptr addrspace(1) %a
   %a.add = add i16 %a.load, 1
@@ -858,10 +858,10 @@ define void @global_i32(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_i32_param_0];
-; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_i32_param_0];
+; CHECK-NEXT:    ld.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.global.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.global.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load i32, ptr addrspace(1) %a
   %a.add = add i32 %a.load, 1
@@ -875,10 +875,10 @@ define void @global_i64(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_i64_param_0];
-; CHECK-NEXT:    ld.global.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_i64_param_0];
+; CHECK-NEXT:    ld.global.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.global.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.global.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load i64, ptr addrspace(1) %a
   %a.add = add i64 %a.load, 1
@@ -893,10 +893,10 @@ define void @global_float(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_float_param_0];
-; CHECK-NEXT:    ld.global.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_float_param_0];
+; CHECK-NEXT:    ld.global.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.global.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr addrspace(1) %a
   %a.add = fadd float %a.load, 1.
@@ -911,10 +911,10 @@ define void @global_double(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_double_param_0];
-; CHECK-NEXT:    ld.global.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_double_param_0];
+; CHECK-NEXT:    ld.global.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.global.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.global.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr addrspace(1) %a
   %a.add = fadd double %a.load, 1.
@@ -931,10 +931,10 @@ define void @global_volatile_i8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_i8_param_0];
-; CHECK-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_i8_param_0];
+; CHECK-NEXT:    ld.volatile.global.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.global.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr addrspace(1) %a
   %a.add = add i8 %a.load, 1
@@ -949,10 +949,10 @@ define void @global_volatile_i16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_i16_param_0];
-; CHECK-NEXT:    ld.volatile.global.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_i16_param_0];
+; CHECK-NEXT:    ld.volatile.global.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.global.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.global.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i16, ptr addrspace(1) %a
   %a.add = add i16 %a.load, 1
@@ -967,10 +967,10 @@ define void @global_volatile_i32(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_i32_param_0];
-; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_i32_param_0];
+; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i32, ptr addrspace(1) %a
   %a.add = add i32 %a.load, 1
@@ -984,10 +984,10 @@ define void @global_volatile_i64(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_i64_param_0];
-; CHECK-NEXT:    ld.volatile.global.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_i64_param_0];
+; CHECK-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.volatile.global.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i64, ptr addrspace(1) %a
   %a.add = add i64 %a.load, 1
@@ -1002,10 +1002,10 @@ define void @global_volatile_float(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_float_param_0];
-; CHECK-NEXT:    ld.volatile.global.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_float_param_0];
+; CHECK-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.global.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr addrspace(1) %a
   %a.add = fadd float %a.load, 1.
@@ -1020,10 +1020,10 @@ define void @global_volatile_double(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_double_param_0];
-; CHECK-NEXT:    ld.volatile.global.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_double_param_0];
+; CHECK-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.global.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr addrspace(1) %a
   %a.add = fadd double %a.load, 1.
@@ -1040,10 +1040,10 @@ define void @global_unordered_sys_i8(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.global.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_i8(
@@ -1052,10 +1052,10 @@ define void @global_unordered_sys_i8(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i8_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i8_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.global.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -1070,10 +1070,10 @@ define void @global_unordered_sys_i16(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.global.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_i16(
@@ -1082,10 +1082,10 @@ define void @global_unordered_sys_i16(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i16_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i16_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.global.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i16, ptr addrspace(1) %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -1100,10 +1100,10 @@ define void @global_unordered_sys_i32(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_i32(
@@ -1112,10 +1112,10 @@ define void @global_unordered_sys_i32(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i32_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i32_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i32, ptr addrspace(1) %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -1129,10 +1129,10 @@ define void @global_unordered_sys_i64(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_i64(
@@ -1140,10 +1140,10 @@ define void @global_unordered_sys_i64(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_i64_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_i64_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i64, ptr addrspace(1) %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -1158,10 +1158,10 @@ define void @global_unordered_sys_float(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_float(
@@ -1170,10 +1170,10 @@ define void @global_unordered_sys_float(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_float_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -1188,10 +1188,10 @@ define void @global_unordered_sys_double(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_sys_double(
@@ -1200,10 +1200,10 @@ define void @global_unordered_sys_double(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_sys_double_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -1220,10 +1220,10 @@ define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.global.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_i8(
@@ -1232,10 +1232,10 @@ define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i8_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -1250,10 +1250,10 @@ define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.global.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_i16(
@@ -1262,10 +1262,10 @@ define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i16_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr addrspace(1) %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -1280,10 +1280,10 @@ define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_i32(
@@ -1292,10 +1292,10 @@ define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i32_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr addrspace(1) %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -1309,10 +1309,10 @@ define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_i64(
@@ -1320,10 +1320,10 @@ define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_i64_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr addrspace(1) %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -1338,10 +1338,10 @@ define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_float(
@@ -1350,10 +1350,10 @@ define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_float_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -1368,10 +1368,10 @@ define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_unordered_volatile_sys_double(
@@ -1380,10 +1380,10 @@ define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_unordered_volatile_sys_double_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -1400,10 +1400,10 @@ define void @global_monotonic_sys_i8(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.global.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_i8(
@@ -1412,10 +1412,10 @@ define void @global_monotonic_sys_i8(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i8_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.global.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -1430,10 +1430,10 @@ define void @global_monotonic_sys_i16(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.global.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_i16(
@@ -1442,10 +1442,10 @@ define void @global_monotonic_sys_i16(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i16_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.global.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i16, ptr addrspace(1) %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -1460,10 +1460,10 @@ define void @global_monotonic_sys_i32(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_i32(
@@ -1472,10 +1472,10 @@ define void @global_monotonic_sys_i32(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i32_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i32, ptr addrspace(1) %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -1489,10 +1489,10 @@ define void @global_monotonic_sys_i64(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_i64(
@@ -1500,10 +1500,10 @@ define void @global_monotonic_sys_i64(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_i64_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i64, ptr addrspace(1) %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -1518,10 +1518,10 @@ define void @global_monotonic_sys_float(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_float(
@@ -1530,10 +1530,10 @@ define void @global_monotonic_sys_float(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_float_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT:    st.relaxed.sys.global.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -1548,10 +1548,10 @@ define void @global_monotonic_sys_double(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_sys_double(
@@ -1560,10 +1560,10 @@ define void @global_monotonic_sys_double(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_sys_double_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.relaxed.sys.global.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -1580,10 +1580,10 @@ define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.global.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_i8(
@@ -1592,10 +1592,10 @@ define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i8_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -1610,10 +1610,10 @@ define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.global.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.global.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.global.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.global.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_i16(
@@ -1622,10 +1622,10 @@ define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i16_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr addrspace(1) %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -1640,10 +1640,10 @@ define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.global.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_i32(
@@ -1652,10 +1652,10 @@ define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i32_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u32 [%rd1], %r2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr addrspace(1) %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -1669,10 +1669,10 @@ define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.global.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.global.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_i64(
@@ -1680,10 +1680,10 @@ define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_i64_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr addrspace(1) %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -1698,10 +1698,10 @@ define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.global.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.global.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.global.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_float(
@@ -1710,10 +1710,10 @@ define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_float_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.f32 [%rd1], %f2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -1728,10 +1728,10 @@ define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.global.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.global.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.global.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: global_monotonic_volatile_sys_double(
@@ -1740,10 +1740,10 @@ define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0];
-; SM70-NEXT:    ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [global_monotonic_volatile_sys_double_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.mmio.relaxed.sys.global.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -1762,10 +1762,10 @@ define void @shared_i8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_i8_param_0];
-; CHECK-NEXT:    ld.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_i8_param_0];
+; CHECK-NEXT:    ld.shared.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.shared.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i8, ptr addrspace(3) %a
   %a.add = add i8 %a.load, 1
@@ -1780,10 +1780,10 @@ define void @shared_i16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_i16_param_0];
-; CHECK-NEXT:    ld.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_i16_param_0];
+; CHECK-NEXT:    ld.shared.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.shared.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i16, ptr addrspace(3) %a
   %a.add = add i16 %a.load, 1
@@ -1798,10 +1798,10 @@ define void @shared_i32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_i32_param_0];
-; CHECK-NEXT:    ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_i32_param_0];
+; CHECK-NEXT:    ld.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.shared.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load i32, ptr addrspace(3) %a
   %a.add = add i32 %a.load, 1
@@ -1815,10 +1815,10 @@ define void @shared_i64(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_i64_param_0];
-; CHECK-NEXT:    ld.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_i64_param_0];
+; CHECK-NEXT:    ld.shared.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load i64, ptr addrspace(3) %a
   %a.add = add i64 %a.load, 1
@@ -1833,10 +1833,10 @@ define void @shared_float(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_float_param_0];
-; CHECK-NEXT:    ld.shared.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_float_param_0];
+; CHECK-NEXT:    ld.shared.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.shared.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.shared.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr addrspace(3) %a
   %a.add = fadd float %a.load, 1.
@@ -1851,10 +1851,10 @@ define void @shared_double(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_double_param_0];
-; CHECK-NEXT:    ld.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_double_param_0];
+; CHECK-NEXT:    ld.shared.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.shared.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr addrspace(3) %a
   %a.add = fadd double %a.load, 1.
@@ -1871,10 +1871,10 @@ define void @shared_volatile_i8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_i8_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_i8_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr addrspace(3) %a
   %a.add = add i8 %a.load, 1
@@ -1889,10 +1889,10 @@ define void @shared_volatile_i16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_i16_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_i16_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i16, ptr addrspace(3) %a
   %a.add = add i16 %a.load, 1
@@ -1907,10 +1907,10 @@ define void @shared_volatile_i32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_i32_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_i32_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i32, ptr addrspace(3) %a
   %a.add = add i32 %a.load, 1
@@ -1924,10 +1924,10 @@ define void @shared_volatile_i64(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_i64_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_i64_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i64, ptr addrspace(3) %a
   %a.add = add i64 %a.load, 1
@@ -1942,10 +1942,10 @@ define void @shared_volatile_float(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_float_param_0];
-; CHECK-NEXT:    ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_float_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr addrspace(3) %a
   %a.add = fadd float %a.load, 1.
@@ -1960,10 +1960,10 @@ define void @shared_volatile_double(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_double_param_0];
-; CHECK-NEXT:    ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_double_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr addrspace(3) %a
   %a.add = fadd double %a.load, 1.
@@ -1980,10 +1980,10 @@ define void @shared_unordered_sys_i8(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_i8(
@@ -1992,10 +1992,10 @@ define void @shared_unordered_sys_i8(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i8_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.shared.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -2010,10 +2010,10 @@ define void @shared_unordered_sys_i16(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.shared.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.shared.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.shared.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.shared.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_i16(
@@ -2022,10 +2022,10 @@ define void @shared_unordered_sys_i16(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i16_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.shared.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i16, ptr addrspace(3) %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -2040,10 +2040,10 @@ define void @shared_unordered_sys_i32(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.shared.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_i32(
@@ -2052,10 +2052,10 @@ define void @shared_unordered_sys_i32(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i32_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u32 [%rd1], %r2;
+; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i32, ptr addrspace(3) %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -2069,10 +2069,10 @@ define void @shared_unordered_sys_i64(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.shared.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.shared.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_i64(
@@ -2080,10 +2080,10 @@ define void @shared_unordered_sys_i64(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_i64_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i64, ptr addrspace(3) %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -2098,10 +2098,10 @@ define void @shared_unordered_sys_float(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.shared.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.shared.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_float(
@@ -2110,10 +2110,10 @@ define void @shared_unordered_sys_float(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_float_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.shared.f32 [%rd1], %f2;
+; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2128,10 +2128,10 @@ define void @shared_unordered_sys_double(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.shared.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.shared.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_unordered_sys_double(
@@ -2140,10 +2140,10 @@ define void @shared_unordered_sys_double(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_unordered_sys_double_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.shared.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -2160,10 +2160,10 @@ define void @shared_unordered_volatile_sys_i8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_unordered_volatile_sys_i8_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -2178,10 +2178,10 @@ define void @shared_unordered_volatile_sys_i16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_unordered_volatile_sys_i16_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr addrspace(3) %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -2196,10 +2196,10 @@ define void @shared_unordered_volatile_sys_i32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_unordered_volatile_sys_i32_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr addrspace(3) %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -2213,10 +2213,10 @@ define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_unordered_volatile_sys_i64_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr addrspace(3) %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -2231,10 +2231,10 @@ define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2249,10 +2249,10 @@ define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -2269,10 +2269,10 @@ define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
-; SM60-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i8_param_0];
+; SM60-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_i8(
@@ -2281,10 +2281,10 @@ define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i8_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b8 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.shared.b8 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -2299,10 +2299,10 @@ define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
-; SM60-NEXT:    ld.volatile.shared.u16 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i16_param_0];
+; SM60-NEXT:    ld.volatile.shared.b16 %rs1, [%rd1];
 ; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM60-NEXT:    st.volatile.shared.u16 [%rd1], %rs2;
+; SM60-NEXT:    st.volatile.shared.b16 [%rd1], %rs2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_i16(
@@ -2311,10 +2311,10 @@ define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u16 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i16_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b16 %rs1, [%rd1];
 ; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u16 [%rd1], %rs2;
+; SM70-NEXT:    st.relaxed.sys.shared.b16 [%rd1], %rs2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i16, ptr addrspace(3) %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -2329,10 +2329,10 @@ define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
-; SM60-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i32_param_0];
+; SM60-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
 ; SM60-NEXT:    add.s32 %r2, %r1, 1;
-; SM60-NEXT:    st.volatile.shared.u32 [%rd1], %r2;
+; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_i32(
@@ -2341,10 +2341,10 @@ define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u32 %r1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i32_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b32 %r1, [%rd1];
 ; SM70-NEXT:    add.s32 %r2, %r1, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u32 [%rd1], %r2;
+; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %r2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i32, ptr addrspace(3) %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -2358,10 +2358,10 @@ define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<4>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
-; SM60-NEXT:    ld.volatile.shared.u64 %rd2, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i64_param_0];
+; SM60-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
 ; SM60-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM60-NEXT:    st.volatile.shared.u64 [%rd1], %rd3;
+; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_i64(
@@ -2369,10 +2369,10 @@ define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<4>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.u64 %rd2, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_i64_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b64 %rd2, [%rd1];
 ; SM70-NEXT:    add.s64 %rd3, %rd2, 1;
-; SM70-NEXT:    st.relaxed.sys.shared.u64 [%rd1], %rd3;
+; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %rd3;
 ; SM70-NEXT:    ret;
   %a.load = load atomic i64, ptr addrspace(3) %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -2387,10 +2387,10 @@ define void @shared_monotonic_sys_float(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %rd<2>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
-; SM60-NEXT:    ld.volatile.shared.f32 %f1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_float_param_0];
+; SM60-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
 ; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM60-NEXT:    st.volatile.shared.f32 [%rd1], %f2;
+; SM60-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_float(
@@ -2399,10 +2399,10 @@ define void @shared_monotonic_sys_float(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %rd<2>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.f32 %f1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_float_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b32 %f1, [%rd1];
 ; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; SM70-NEXT:    st.relaxed.sys.shared.f32 [%rd1], %f2;
+; SM70-NEXT:    st.relaxed.sys.shared.b32 [%rd1], %f2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -2417,10 +2417,10 @@ define void @shared_monotonic_sys_double(ptr addrspace(3) %a) {
 ; SM60-NEXT:    .reg .b64 %fd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
-; SM60-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
-; SM60-NEXT:    ld.volatile.shared.f64 %fd1, [%rd1];
+; SM60-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_double_param_0];
+; SM60-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
 ; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM60-NEXT:    st.volatile.shared.f64 [%rd1], %fd2;
+; SM60-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
 ; SM60-NEXT:    ret;
 ;
 ; SM70-LABEL: shared_monotonic_sys_double(
@@ -2429,10 +2429,10 @@ define void @shared_monotonic_sys_double(ptr addrspace(3) %a) {
 ; SM70-NEXT:    .reg .b64 %fd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0];
-; SM70-NEXT:    ld.relaxed.sys.shared.f64 %fd1, [%rd1];
+; SM70-NEXT:    ld.param.b64 %rd1, [shared_monotonic_sys_double_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.b64 %fd1, [%rd1];
 ; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; SM70-NEXT:    st.relaxed.sys.shared.f64 [%rd1], %fd2;
+; SM70-NEXT:    st.relaxed.sys.shared.b64 [%rd1], %fd2;
 ; SM70-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -2449,10 +2449,10 @@ define void @shared_monotonic_volatile_sys_i8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i8_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.shared.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -2467,10 +2467,10 @@ define void @shared_monotonic_volatile_sys_i16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i16_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.volatile.shared.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr addrspace(3) %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -2485,10 +2485,10 @@ define void @shared_monotonic_volatile_sys_i32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i32_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr addrspace(3) %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -2502,10 +2502,10 @@ define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i64_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.volatile.shared.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr addrspace(3) %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -2520,10 +2520,10 @@ define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.volatile.shared.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -2538,10 +2538,10 @@ define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.volatile.shared.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.volatile.shared.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -2560,10 +2560,10 @@ define void @local_i8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_i8_param_0];
-; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_i8_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i8, ptr addrspace(5) %a
   %a.add = add i8 %a.load, 1
@@ -2578,10 +2578,10 @@ define void @local_i16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_i16_param_0];
-; CHECK-NEXT:    ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_i16_param_0];
+; CHECK-NEXT:    ld.local.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load i16, ptr addrspace(5) %a
   %a.add = add i16 %a.load, 1
@@ -2596,10 +2596,10 @@ define void @local_i32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_i32_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_i32_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load i32, ptr addrspace(5) %a
   %a.add = add i32 %a.load, 1
@@ -2613,10 +2613,10 @@ define void @local_i64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_i64_param_0];
-; CHECK-NEXT:    ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_i64_param_0];
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load i64, ptr addrspace(5) %a
   %a.add = add i64 %a.load, 1
@@ -2631,10 +2631,10 @@ define void @local_float(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_float_param_0];
-; CHECK-NEXT:    ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_float_param_0];
+; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load float, ptr addrspace(5) %a
   %a.add = fadd float %a.load, 1.
@@ -2649,10 +2649,10 @@ define void @local_double(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_double_param_0];
-; CHECK-NEXT:    ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_double_param_0];
+; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load double, ptr addrspace(5) %a
   %a.add = fadd double %a.load, 1.
@@ -2669,10 +2669,10 @@ define void @local_volatile_i8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_i8_param_0];
-; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_i8_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr addrspace(5) %a
   %a.add = add i8 %a.load, 1
@@ -2687,10 +2687,10 @@ define void @local_volatile_i16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_i16_param_0];
-; CHECK-NEXT:    ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_i16_param_0];
+; CHECK-NEXT:    ld.local.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i16, ptr addrspace(5) %a
   %a.add = add i16 %a.load, 1
@@ -2705,10 +2705,10 @@ define void @local_volatile_i32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_i32_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_i32_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i32, ptr addrspace(5) %a
   %a.add = add i32 %a.load, 1
@@ -2722,10 +2722,10 @@ define void @local_volatile_i64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_i64_param_0];
-; CHECK-NEXT:    ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_i64_param_0];
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile i64, ptr addrspace(5) %a
   %a.add = add i64 %a.load, 1
@@ -2740,10 +2740,10 @@ define void @local_volatile_float(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_float_param_0];
-; CHECK-NEXT:    ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_float_param_0];
+; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile float, ptr addrspace(5) %a
   %a.add = fadd float %a.load, 1.
@@ -2758,10 +2758,10 @@ define void @local_volatile_double(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_double_param_0];
-; CHECK-NEXT:    ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_double_param_0];
+; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile double, ptr addrspace(5) %a
   %a.add = fadd double %a.load, 1.
@@ -2778,10 +2778,10 @@ define void @local_unordered_sys_i8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_sys_i8_param_0];
-; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_i8_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -2796,10 +2796,10 @@ define void @local_unordered_sys_i16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_sys_i16_param_0];
-; CHECK-NEXT:    ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_i16_param_0];
+; CHECK-NEXT:    ld.local.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i16, ptr addrspace(5) %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -2814,10 +2814,10 @@ define void @local_unordered_sys_i32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_sys_i32_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_i32_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i32, ptr addrspace(5) %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -2831,10 +2831,10 @@ define void @local_unordered_sys_i64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_sys_i64_param_0];
-; CHECK-NEXT:    ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_i64_param_0];
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i64, ptr addrspace(5) %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -2849,10 +2849,10 @@ define void @local_unordered_sys_float(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_sys_float_param_0];
-; CHECK-NEXT:    ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_float_param_0];
+; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2867,10 +2867,10 @@ define void @local_unordered_sys_double(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_sys_double_param_0];
-; CHECK-NEXT:    ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_sys_double_param_0];
+; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -2887,10 +2887,10 @@ define void @local_unordered_volatile_sys_i8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_volatile_sys_i8_param_0];
-; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_i8_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
   %a.add = add i8 %a.load, 1
@@ -2905,10 +2905,10 @@ define void @local_unordered_volatile_sys_i16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_volatile_sys_i16_param_0];
-; CHECK-NEXT:    ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_i16_param_0];
+; CHECK-NEXT:    ld.local.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr addrspace(5) %a unordered, align 2
   %a.add = add i16 %a.load, 1
@@ -2923,10 +2923,10 @@ define void @local_unordered_volatile_sys_i32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_volatile_sys_i32_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_i32_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr addrspace(5) %a unordered, align 4
   %a.add = add i32 %a.load, 1
@@ -2940,10 +2940,10 @@ define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_volatile_sys_i64_param_0];
-; CHECK-NEXT:    ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_i64_param_0];
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr addrspace(5) %a unordered, align 8
   %a.add = add i64 %a.load, 1
@@ -2958,10 +2958,10 @@ define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_float_param_0];
+; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4
   %a.add = fadd float %a.load, 1.
@@ -2976,10 +2976,10 @@ define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_unordered_volatile_sys_double_param_0];
+; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8
   %a.add = fadd double %a.load, 1.
@@ -2996,10 +2996,10 @@ define void @local_monotonic_sys_i8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_sys_i8_param_0];
-; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_i8_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -3014,10 +3014,10 @@ define void @local_monotonic_sys_i16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_sys_i16_param_0];
-; CHECK-NEXT:    ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_i16_param_0];
+; CHECK-NEXT:    ld.local.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i16, ptr addrspace(5) %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -3032,10 +3032,10 @@ define void @local_monotonic_sys_i32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_sys_i32_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_i32_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i32, ptr addrspace(5) %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -3049,10 +3049,10 @@ define void @local_monotonic_sys_i64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_sys_i64_param_0];
-; CHECK-NEXT:    ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_i64_param_0];
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic i64, ptr addrspace(5) %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -3067,10 +3067,10 @@ define void @local_monotonic_sys_float(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_sys_float_param_0];
-; CHECK-NEXT:    ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_float_param_0];
+; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -3085,10 +3085,10 @@ define void @local_monotonic_sys_double(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_sys_double_param_0];
-; CHECK-NEXT:    ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_sys_double_param_0];
+; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
@@ -3105,10 +3105,10 @@ define void @local_monotonic_volatile_sys_i8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_volatile_sys_i8_param_0];
-; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_i8_param_0];
+; CHECK-NEXT:    ld.local.b8 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b8 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
@@ -3123,10 +3123,10 @@ define void @local_monotonic_volatile_sys_i16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_volatile_sys_i16_param_0];
-; CHECK-NEXT:    ld.local.u16 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_i16_param_0];
+; CHECK-NEXT:    ld.local.b16 %rs1, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    st.local.u16 [%rd1], %rs2;
+; CHECK-NEXT:    st.local.b16 [%rd1], %rs2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i16, ptr addrspace(5) %a monotonic, align 2
   %a.add = add i16 %a.load, 1
@@ -3141,10 +3141,10 @@ define void @local_monotonic_volatile_sys_i32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_volatile_sys_i32_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_i32_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    add.s32 %r2, %r1, 1;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i32, ptr addrspace(5) %a monotonic, align 4
   %a.add = add i32 %a.load, 1
@@ -3158,10 +3158,10 @@ define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_volatile_sys_i64_param_0];
-; CHECK-NEXT:    ld.local.u64 %rd2, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_i64_param_0];
+; CHECK-NEXT:    ld.local.b64 %rd2, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT:    st.local.u64 [%rd1], %rd3;
+; CHECK-NEXT:    st.local.b64 [%rd1], %rd3;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i64, ptr addrspace(5) %a monotonic, align 8
   %a.add = add i64 %a.load, 1
@@ -3176,10 +3176,10 @@ define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_volatile_sys_float_param_0];
-; CHECK-NEXT:    ld.local.f32 %f1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_float_param_0];
+; CHECK-NEXT:    ld.local.b32 %f1, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.f32 [%rd1], %f2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %f2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4
   %a.add = fadd float %a.load, 1.
@@ -3194,10 +3194,10 @@ define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_volatile_sys_double_param_0];
-; CHECK-NEXT:    ld.local.f64 %fd1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_monotonic_volatile_sys_double_param_0];
+; CHECK-NEXT:    ld.local.b64 %fd1, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.f64 [%rd1], %fd2;
+; CHECK-NEXT:    st.local.b64 [%rd1], %fd2;
 ; CHECK-NEXT:    ret;
   %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8
   %a.add = fadd double %a.load, 1.
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 74554dfcd679a..f967fd1381be5 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -34,40 +34,40 @@
 
 ; CHECK-LABEL: generic_unordered_gpu
 define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") unordered, align 8
 
   ret void
@@ -75,40 +75,40 @@ define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_unordered_volatile_gpu
 define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8
 
   ret void
@@ -116,40 +116,40 @@ define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_unordered_cta
 define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") unordered, align 8
 
   ret void
@@ -157,40 +157,40 @@ define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_unordered_volatile_cta
 define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8
 
   ret void
@@ -198,40 +198,40 @@ define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_monotonic_gpu
 define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8
 
   ret void
@@ -239,40 +239,40 @@ define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_monotonic_volatile_gpu
 define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8
 
   ret void
@@ -280,40 +280,40 @@ define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_monotonic_cta
 define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8
 
   ret void
@@ -321,40 +321,40 @@ define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local
 
 ; CHECK-LABEL: generic_monotonic_volatile_cta
 define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8
 
   ret void
@@ -362,40 +362,40 @@ define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ; CHECK-LABEL: generic_acq_rel_sys
 define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a release, align 1
 
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b release, align 2
 
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c release, align 4
 
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d release, align 8
 
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e release, align 4
 
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e release, align 8
 
   ret void
@@ -403,40 +403,40 @@ define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
 
 ; CHECK-LABEL: generic_acq_rel_volatile_sys
 define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a release, align 1
 
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b release, align 2
 
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c release, align 4
 
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d release, align 8
 
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e release, align 4
 
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e release, align 8
 
   ret void
@@ -444,40 +444,40 @@ define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e
 
 ; CHECK-LABEL: generic_acq_rel_gpu
 define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") release, align 8
 
   ret void
@@ -485,40 +485,40 @@ define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
 
 ; CHECK-LABEL: generic_acq_rel_volatile_gpu
 define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8
 
   ret void
@@ -526,40 +526,40 @@ define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e
 
 ; CHECK-LABEL: generic_acq_rel_cta
 define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") release, align 8
 
   ret void
@@ -567,40 +567,40 @@ define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
 
 ; CHECK-LABEL: generic_acq_rel_volatile_cta
 define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8
 
   ret void
@@ -609,51 +609,51 @@ define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e
 ; CHECK-LABEL: generic_sc_sys
 define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e seq_cst, align 8
 
   ret void
@@ -662,51 +662,51 @@ define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname
 ; CHECK-LABEL: generic_sc_volatile_sys
 define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e seq_cst, align 8
 
   ret void
@@ -715,51 +715,51 @@ define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 ; CHECK-LABEL: generic_sc_gpu
 define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -768,51 +768,51 @@ define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname
 ; CHECK-LABEL: generic_sc_volatile_gpu
 define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]  
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -821,51 +821,51 @@ define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 ; CHECK-LABEL: generic_sc_cta
 define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -874,51 +874,51 @@ define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname
 ; CHECK-LABEL: generic_sc_volatile_cta
 define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -928,40 +928,40 @@ define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 
 ; CHECK-LABEL: global_unordered_gpu
 define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
 
   ret void
@@ -969,40 +969,40 @@ define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_unordered_volatile_gpu
 define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
 
   ret void
@@ -1010,40 +1010,40 @@ define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_unordered_cta
 define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1051,40 +1051,40 @@ define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_unordered_volatile_cta
 define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1092,40 +1092,40 @@ define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_monotonic_gpu
 define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -1133,40 +1133,40 @@ define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_monotonic_volatile_gpu
 define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -1174,40 +1174,40 @@ define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_monotonic_cta
 define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -1215,40 +1215,40 @@ define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr
 
 ; CHECK-LABEL: global_monotonic_volatile_cta
 define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -1256,40 +1256,40 @@ define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1)
 
 ; CHECK-LABEL: global_acq_rel_sys
 define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e release, align 8
 
   ret void
@@ -1297,40 +1297,40 @@ define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_acq_rel_volatile_sys
 define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
 
   ret void
@@ -1338,40 +1338,40 @@ define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: global_acq_rel_gpu
 define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
 
   ret void
@@ -1379,40 +1379,40 @@ define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_acq_rel_volatile_gpu
 define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
 
   ret void
@@ -1420,40 +1420,40 @@ define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: global_acq_rel_cta
 define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
 
   ret void
@@ -1461,40 +1461,40 @@ define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 
 ; CHECK-LABEL: global_acq_rel_volatile_cta
 define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
 
   ret void
@@ -1503,51 +1503,51 @@ define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %
 ; CHECK-LABEL: global_seq_cst_sys
 define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
 
   ret void
@@ -1556,51 +1556,51 @@ define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 ; CHECK-LABEL: global_seq_cst_volatile_sys
 define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
 
   ret void
@@ -1609,51 +1609,51 @@ define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %
 ; CHECK-LABEL: global_seq_cst_gpu
 define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -1662,51 +1662,51 @@ define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 ; CHECK-LABEL: global_seq_cst_volatile_gpu
 define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -1715,51 +1715,51 @@ define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %
 ; CHECK-LABEL: global_seq_cst_cta
 define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -1768,51 +1768,51 @@ define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad
 ; CHECK-LABEL: global_seq_cst_volatile_cta
 define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -1822,40 +1822,40 @@ define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %
 
 ; CHECK-LABEL: shared_unordered_gpu
 define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
 
   ret void
@@ -1863,40 +1863,40 @@ define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_unordered_volatile_gpu
 define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
 
   ret void
@@ -1904,40 +1904,40 @@ define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_unordered_cta
 define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1945,40 +1945,40 @@ define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_unordered_volatile_cta
 define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
 
   ret void
@@ -1986,40 +1986,40 @@ define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_monotonic_gpu
 define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2027,40 +2027,40 @@ define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_monotonic_volatile_gpu
 define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2068,40 +2068,40 @@ define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_monotonic_cta
 define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -2109,40 +2109,40 @@ define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr
 
 ; CHECK-LABEL: shared_monotonic_volatile_cta
 define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -2150,40 +2150,40 @@ define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3)
 
 ; CHECK-LABEL: shared_acq_rel_sys
 define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e release, align 8
 
   ret void
@@ -2191,40 +2191,40 @@ define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_acq_rel_volatile_sys
 define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
 
   ret void
@@ -2232,40 +2232,40 @@ define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: shared_acq_rel_gpu
 define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
 
   ret void
@@ -2273,40 +2273,40 @@ define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_acq_rel_volatile_gpu
 define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
 
   ret void
@@ -2314,40 +2314,40 @@ define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: shared_acq_rel_cta
 define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
 
   ret void
@@ -2355,40 +2355,40 @@ define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 
 ; CHECK-LABEL: shared_acq_rel_volatile_cta
 define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
 
   ret void
@@ -2397,51 +2397,51 @@ define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %
 ; CHECK-LABEL: shared_seq_cst_sys
 define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
 
   ret void
@@ -2450,51 +2450,51 @@ define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 ; CHECK-LABEL: shared_seq_cst_volatile_sys
 define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
 
   ret void
@@ -2503,51 +2503,51 @@ define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %
 ; CHECK-LABEL: shared_seq_cst_gpu
 define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.gpu
-  ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.gpu
-  ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -2556,51 +2556,51 @@ define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 ; CHECK-LABEL: shared_seq_cst_volatile_gpu
 define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -2609,51 +2609,51 @@ define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %
 ; CHECK-LABEL: shared_seq_cst_cta
 define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.cta
-  ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.cta
-  ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -2662,51 +2662,51 @@ define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad
 ; CHECK-LABEL: shared_seq_cst_volatile_cta
 define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -2716,40 +2716,40 @@ define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %
 
 ; CHECK-LABEL: local_unordered_gpu
 define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
 
   ret void
@@ -2757,40 +2757,40 @@ define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_unordered_volatile_gpu
 define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
 
   ret void
@@ -2798,40 +2798,40 @@ define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_unordered_cta
 define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
 
   ret void
@@ -2839,40 +2839,40 @@ define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_unordered_volatile_cta
 define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
 
   ret void
@@ -2880,40 +2880,40 @@ define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_monotonic_gpu
 define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2921,40 +2921,40 @@ define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_monotonic_volatile_gpu
 define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
 
   ret void
@@ -2962,40 +2962,40 @@ define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_monotonic_cta
 define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -3003,40 +3003,40 @@ define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a
 
 ; CHECK-LABEL: local_monotonic_volatile_cta
 define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
 
   ret void
@@ -3044,40 +3044,40 @@ define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5)
 
 ; CHECK-LABEL: local_acq_rel_sys
 define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e release, align 8
 
   ret void
@@ -3085,40 +3085,40 @@ define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_acq_rel_volatile_sys
 define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
 
   ret void
@@ -3126,40 +3126,40 @@ define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_acq_rel_gpu
 define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
 
   ret void
@@ -3167,40 +3167,40 @@ define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_acq_rel_volatile_gpu
 define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
 
   ret void
@@ -3208,40 +3208,40 @@ define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_acq_rel_cta
 define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
 
   ret void
@@ -3249,40 +3249,40 @@ define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_acq_rel_volatile_cta
 define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
 
   ret void
@@ -3290,40 +3290,40 @@ define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_seq_cst_sys
 define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8
 
   ret void
@@ -3331,40 +3331,40 @@ define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_seq_cst_volatile_sys
 define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
 
   ret void
@@ -3372,40 +3372,40 @@ define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_seq_cst_gpu
 define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -3413,40 +3413,40 @@ define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_seq_cst_volatile_gpu
 define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
 
   ret void
@@ -3454,40 +3454,40 @@ define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b
 
 ; CHECK-LABEL: local_seq_cst_cta
 define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
 
   ret void
@@ -3495,40 +3495,40 @@ define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add
 
 ; CHECK-LABEL: local_seq_cst_volatile_cta
 define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
 
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
index 345b55eb65bda..ae559f50d4987 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
@@ -34,40 +34,40 @@
 
 ; CHECK-LABEL: generic_unordered_cluster
 define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -75,40 +75,40 @@ define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) l
 
 ; CHECK-LABEL: generic_unordered_volatile_cluster
 define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -116,40 +116,40 @@ define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d,
 
 ; CHECK-LABEL: generic_monotonic_cluster
 define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -157,40 +157,40 @@ define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) l
 
 ; CHECK-LABEL: generic_monotonic_volatile_cluster
 define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -198,40 +198,40 @@ define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d,
 
 ; CHECK-LABEL: generic_acq_rel_cluster
 define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") release, align 8
 
   ret void
@@ -239,40 +239,40 @@ define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc
 
 ; CHECK-LABEL: generic_acq_rel_volatile_cluster
 define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8
 
   ret void
@@ -281,51 +281,51 @@ define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, pt
 ; CHECK-LABEL: generic_sc_cluster
 define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -334,51 +334,51 @@ define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_un
 ; CHECK-LABEL: generic_sc_volatile_cluster
 define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -388,40 +388,40 @@ define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e)
 
 ; CHECK-LABEL: global_unordered_cluster
 define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -429,40 +429,40 @@ define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b,
 
 ; CHECK-LABEL: global_unordered_volatile_cluster
 define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -470,40 +470,40 @@ define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspac
 
 ; CHECK-LABEL: global_monotonic_cluster
 define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -511,40 +511,40 @@ define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b,
 
 ; CHECK-LABEL: global_monotonic_volatile_cluster
 define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -552,40 +552,40 @@ define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspac
 
 ; CHECK-LABEL: global_acq_rel_cluster
 define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
 
   ret void
@@ -593,40 +593,40 @@ define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, pt
 
 ; CHECK-LABEL: global_acq_rel_volatile_cluster
 define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
 
   ret void
@@ -635,51 +635,51 @@ define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(
 ; CHECK-LABEL: global_seq_cst_cluster
 define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -688,51 +688,51 @@ define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, pt
 ; CHECK-LABEL: global_seq_cst_volatile_cluster
 define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -742,40 +742,40 @@ define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(
 
 ; CHECK-LABEL: shared_unordered_cluster
 define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -783,40 +783,40 @@ define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b,
 
 ; CHECK-LABEL: shared_unordered_volatile_cluster
 define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -824,40 +824,40 @@ define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspac
 
 ; CHECK-LABEL: shared_monotonic_cluster
 define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.relaxed.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -865,40 +865,40 @@ define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b,
 
 ; CHECK-LABEL: shared_monotonic_volatile_cluster
 define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -906,40 +906,40 @@ define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspac
 
 ; CHECK-LABEL: shared_acq_rel_cluster
 define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
 
   ret void
@@ -947,40 +947,40 @@ define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, pt
 
 ; CHECK-LABEL: shared_acq_rel_volatile_cluster
 define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
 
   ret void
@@ -989,51 +989,51 @@ define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(
 ; CHECK-LABEL: shared_seq_cst_cluster
 define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.cluster
-  ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.cluster
-  ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -1042,51 +1042,51 @@ define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, pt
 ; CHECK-LABEL: shared_seq_cst_volatile_cluster
 define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
 
   ; CHECK: fence.sc.sys
-  ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
   ; CHECK: fence.sc.sys
-  ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -1096,40 +1096,40 @@ define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(
 
 ; CHECK-LABEL: local_unordered_cluster
 define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -1137,40 +1137,40 @@ define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, p
 
 ; CHECK-LABEL: local_unordered_volatile_cluster
 define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
 
   ret void
@@ -1178,40 +1178,40 @@ define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace
 
 ; CHECK-LABEL: local_monotonic_cluster
 define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -1219,40 +1219,40 @@ define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, p
 
 ; CHECK-LABEL: local_monotonic_volatile_cluster
 define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
 
   ret void
@@ -1260,40 +1260,40 @@ define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace
 
 ; CHECK-LABEL: local_acq_rel_cluster
 define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
 
   ret void
@@ -1301,40 +1301,40 @@ define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr
 
 ; CHECK-LABEL: local_acq_rel_volatile_cluster
 define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
 
   ret void
@@ -1342,40 +1342,40 @@ define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5
 
 ; CHECK-LABEL: local_seq_cst_cluster
 define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
 
   ret void
@@ -1383,40 +1383,40 @@ define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr
 
 ; CHECK-LABEL: local_seq_cst_volatile_cluster
 define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+  ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+  ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+  ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+  ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
 
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
index 3215fce964005..2b5553a77fe98 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
@@ -31,11 +31,11 @@ define void @generic_2xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi8_param_0];
-; CHECK-NEXT:    ld.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi8_param_0];
+; CHECK-NEXT:    ld.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i8>, ptr %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -54,8 +54,8 @@ define void @generic_4xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -75,7 +75,7 @@ define void @generic_4xi8(ptr %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i8>, ptr %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -91,7 +91,7 @@ define void @generic_8xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xi8_param_0];
 ; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -147,7 +147,7 @@ define void @generic_16xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_16xi8_param_0];
 ; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -241,13 +241,13 @@ define void @generic_2xi16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi16_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi16_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i16>, ptr %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -262,13 +262,13 @@ define void @generic_4xi16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi16_param_0];
-; CHECK-NEXT:    ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi16_param_0];
+; CHECK-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i16>, ptr %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -284,7 +284,7 @@ define void @generic_8xi16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xi16_param_0];
 ; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -317,11 +317,11 @@ define void @generic_2xi32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi32_param_0];
-; CHECK-NEXT:    ld.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi32_param_0];
+; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i32>, ptr %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -336,13 +336,13 @@ define void @generic_4xi32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi32_param_0];
-; CHECK-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi32_param_0];
+; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i32>, ptr %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -356,11 +356,11 @@ define void @generic_2xi64(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi64_param_0];
-; CHECK-NEXT:    ld.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi64_param_0];
+; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i64>, ptr %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -375,11 +375,11 @@ define void @generic_2xfloat(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xfloat_param_0];
-; CHECK-NEXT:    ld.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xfloat_param_0];
+; CHECK-NEXT:    ld.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -394,13 +394,13 @@ define void @generic_4xfloat(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xfloat_param_0];
-; CHECK-NEXT:    ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xfloat_param_0];
+; CHECK-NEXT:    ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -415,11 +415,11 @@ define void @generic_2xdouble(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xdouble_param_0];
-; CHECK-NEXT:    ld.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xdouble_param_0];
+; CHECK-NEXT:    ld.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -453,11 +453,11 @@ define void @generic_volatile_2xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi8_param_0];
-; CHECK-NEXT:    ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi8_param_0];
+; CHECK-NEXT:    ld.volatile.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.volatile.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.volatile.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i8>, ptr %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -475,8 +475,8 @@ define void @generic_volatile_4xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi8_param_0];
-; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi8_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -496,7 +496,7 @@ define void @generic_volatile_4xi8(ptr %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.volatile.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i8>, ptr %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -512,7 +512,7 @@ define void @generic_volatile_8xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -568,7 +568,7 @@ define void @generic_volatile_16xi8(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -662,13 +662,13 @@ define void @generic_volatile_2xi16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi16_param_0];
-; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi16_param_0];
+; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i16>, ptr %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -683,13 +683,13 @@ define void @generic_volatile_4xi16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi16_param_0];
-; CHECK-NEXT:    ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi16_param_0];
+; CHECK-NEXT:    ld.volatile.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.volatile.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i16>, ptr %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -705,7 +705,7 @@ define void @generic_volatile_8xi16(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xi16_param_0];
 ; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -738,11 +738,11 @@ define void @generic_volatile_2xi32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi32_param_0];
-; CHECK-NEXT:    ld.volatile.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi32_param_0];
+; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.volatile.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i32>, ptr %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -757,13 +757,13 @@ define void @generic_volatile_4xi32(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi32_param_0];
-; CHECK-NEXT:    ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi32_param_0];
+; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i32>, ptr %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -777,11 +777,11 @@ define void @generic_volatile_2xi64(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi64_param_0];
-; CHECK-NEXT:    ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi64_param_0];
+; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.volatile.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.volatile.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i64>, ptr %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -796,11 +796,11 @@ define void @generic_volatile_2xfloat(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -815,13 +815,13 @@ define void @generic_volatile_4xfloat(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -836,11 +836,11 @@ define void @generic_volatile_2xdouble(ptr %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xdouble_param_0];
+; CHECK-NEXT:    ld.volatile.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.volatile.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -859,11 +859,11 @@ define void @global_2xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi8_param_0];
-; CHECK-NEXT:    ld.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi8_param_0];
+; CHECK-NEXT:    ld.global.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.global.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.global.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i8>, ptr addrspace(1) %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -879,8 +879,8 @@ define void @global_4xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi8_param_0];
-; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xi8_param_0];
+; CHECK-NEXT:    ld.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -900,7 +900,7 @@ define void @global_4xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.global.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.global.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i8>, ptr addrspace(1) %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -916,7 +916,7 @@ define void @global_8xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_8xi8_param_0];
 ; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -972,7 +972,7 @@ define void @global_16xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_16xi8_param_0];
 ; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -1066,13 +1066,13 @@ define void @global_2xi16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi16_param_0];
-; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi16_param_0];
+; CHECK-NEXT:    ld.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.global.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.global.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i16>, ptr addrspace(1) %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -1087,13 +1087,13 @@ define void @global_4xi16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi16_param_0];
-; CHECK-NEXT:    ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xi16_param_0];
+; CHECK-NEXT:    ld.global.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.global.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i16>, ptr addrspace(1) %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -1109,7 +1109,7 @@ define void @global_8xi16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_8xi16_param_0];
 ; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -1142,11 +1142,11 @@ define void @global_2xi32(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi32_param_0];
-; CHECK-NEXT:    ld.global.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi32_param_0];
+; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.global.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i32>, ptr addrspace(1) %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -1161,13 +1161,13 @@ define void @global_4xi32(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi32_param_0];
-; CHECK-NEXT:    ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xi32_param_0];
+; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i32>, ptr addrspace(1) %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -1181,11 +1181,11 @@ define void @global_2xi64(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi64_param_0];
-; CHECK-NEXT:    ld.global.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi64_param_0];
+; CHECK-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.global.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.global.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i64>, ptr addrspace(1) %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -1200,11 +1200,11 @@ define void @global_2xfloat(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xfloat_param_0];
-; CHECK-NEXT:    ld.global.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xfloat_param_0];
+; CHECK-NEXT:    ld.global.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr addrspace(1) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -1219,13 +1219,13 @@ define void @global_4xfloat(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xfloat_param_0];
-; CHECK-NEXT:    ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xfloat_param_0];
+; CHECK-NEXT:    ld.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr addrspace(1) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -1240,11 +1240,11 @@ define void @global_2xdouble(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xdouble_param_0];
-; CHECK-NEXT:    ld.global.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xdouble_param_0];
+; CHECK-NEXT:    ld.global.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.global.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.global.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr addrspace(1) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -1261,11 +1261,11 @@ define void @global_volatile_2xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi8_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi8_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.volatile.global.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -1281,8 +1281,8 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
-; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi8_param_0];
+; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -1302,7 +1302,7 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i8>, ptr addrspace(1) %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -1318,7 +1318,7 @@ define void @global_volatile_8xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -1374,7 +1374,7 @@ define void @global_volatile_16xi8(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -1468,13 +1468,13 @@ define void @global_volatile_2xi16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi16_param_0];
-; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi16_param_0];
+; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.global.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i16>, ptr addrspace(1) %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -1489,13 +1489,13 @@ define void @global_volatile_4xi16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi16_param_0];
-; CHECK-NEXT:    ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi16_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.volatile.global.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i16>, ptr addrspace(1) %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -1511,7 +1511,7 @@ define void @global_volatile_8xi16(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_8xi16_param_0];
 ; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -1544,11 +1544,11 @@ define void @global_volatile_2xi32(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi32_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi32_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.volatile.global.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i32>, ptr addrspace(1) %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -1563,13 +1563,13 @@ define void @global_volatile_4xi32(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi32_param_0];
-; CHECK-NEXT:    ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi32_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i32>, ptr addrspace(1) %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -1583,11 +1583,11 @@ define void @global_volatile_2xi64(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi64_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi64_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.volatile.global.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i64>, ptr addrspace(1) %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -1602,11 +1602,11 @@ define void @global_volatile_2xfloat(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.global.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr addrspace(1) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -1621,13 +1621,13 @@ define void @global_volatile_4xfloat(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr addrspace(1) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -1642,11 +1642,11 @@ define void @global_volatile_2xdouble(ptr addrspace(1) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xdouble_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.volatile.global.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr addrspace(1) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -1665,11 +1665,11 @@ define void @shared_2xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi8_param_0];
-; CHECK-NEXT:    ld.shared.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi8_param_0];
+; CHECK-NEXT:    ld.shared.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.shared.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.shared.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i8>, ptr addrspace(3) %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -1685,8 +1685,8 @@ define void @shared_4xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi8_param_0];
-; CHECK-NEXT:    ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi8_param_0];
+; CHECK-NEXT:    ld.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -1706,7 +1706,7 @@ define void @shared_4xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.shared.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.shared.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i8>, ptr addrspace(3) %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -1722,7 +1722,7 @@ define void @shared_8xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xi8_param_0];
 ; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -1778,7 +1778,7 @@ define void @shared_16xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_16xi8_param_0];
 ; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -1872,13 +1872,13 @@ define void @shared_2xi16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi16_param_0];
-; CHECK-NEXT:    ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi16_param_0];
+; CHECK-NEXT:    ld.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.shared.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i16>, ptr addrspace(3) %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -1893,13 +1893,13 @@ define void @shared_4xi16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi16_param_0];
-; CHECK-NEXT:    ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi16_param_0];
+; CHECK-NEXT:    ld.shared.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.shared.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i16>, ptr addrspace(3) %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -1915,7 +1915,7 @@ define void @shared_8xi16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xi16_param_0];
 ; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -1948,11 +1948,11 @@ define void @shared_2xi32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi32_param_0];
-; CHECK-NEXT:    ld.shared.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi32_param_0];
+; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.shared.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i32>, ptr addrspace(3) %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -1967,13 +1967,13 @@ define void @shared_4xi32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi32_param_0];
-; CHECK-NEXT:    ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi32_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i32>, ptr addrspace(3) %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -1987,11 +1987,11 @@ define void @shared_2xi64(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi64_param_0];
-; CHECK-NEXT:    ld.shared.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi64_param_0];
+; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.shared.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i64>, ptr addrspace(3) %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -2006,11 +2006,11 @@ define void @shared_2xfloat(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xfloat_param_0];
-; CHECK-NEXT:    ld.shared.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xfloat_param_0];
+; CHECK-NEXT:    ld.shared.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.shared.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr addrspace(3) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -2025,13 +2025,13 @@ define void @shared_4xfloat(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xfloat_param_0];
-; CHECK-NEXT:    ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xfloat_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr addrspace(3) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -2046,11 +2046,11 @@ define void @shared_2xdouble(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xdouble_param_0];
-; CHECK-NEXT:    ld.shared.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xdouble_param_0];
+; CHECK-NEXT:    ld.shared.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.shared.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.shared.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr addrspace(3) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -2067,11 +2067,11 @@ define void @shared_volatile_2xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi8_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi8_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.volatile.shared.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i8>, ptr addrspace(3) %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -2087,8 +2087,8 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xi8_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi8_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -2108,7 +2108,7 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i8>, ptr addrspace(3) %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -2124,7 +2124,7 @@ define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -2180,7 +2180,7 @@ define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -2274,13 +2274,13 @@ define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi16_param_0];
-; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi16_param_0];
+; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i16>, ptr addrspace(3) %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -2295,13 +2295,13 @@ define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xi16_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi16_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.volatile.shared.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i16>, ptr addrspace(3) %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -2317,7 +2317,7 @@ define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xi16_param_0];
 ; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -2350,11 +2350,11 @@ define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi32_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi32_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.volatile.shared.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i32>, ptr addrspace(3) %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -2369,13 +2369,13 @@ define void @shared_volatile_4xi32(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xi32_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi32_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i32>, ptr addrspace(3) %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -2389,11 +2389,11 @@ define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi64_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi64_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i64>, ptr addrspace(3) %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -2408,11 +2408,11 @@ define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr addrspace(3) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -2427,13 +2427,13 @@ define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr addrspace(3) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -2448,11 +2448,11 @@ define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xdouble_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr addrspace(3) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -2471,11 +2471,11 @@ define void @local_2xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi8_param_0];
-; CHECK-NEXT:    ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi8_param_0];
+; CHECK-NEXT:    ld.local.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.local.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.local.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i8>, ptr addrspace(5) %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -2491,8 +2491,8 @@ define void @local_4xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xi8_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi8_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -2512,7 +2512,7 @@ define void @local_4xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i8>, ptr addrspace(5) %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -2528,7 +2528,7 @@ define void @local_8xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xi8_param_0];
 ; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -2584,7 +2584,7 @@ define void @local_16xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_16xi8_param_0];
 ; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -2678,13 +2678,13 @@ define void @local_2xi16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi16_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi16_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i16>, ptr addrspace(5) %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -2699,13 +2699,13 @@ define void @local_4xi16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xi16_param_0];
-; CHECK-NEXT:    ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi16_param_0];
+; CHECK-NEXT:    ld.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i16>, ptr addrspace(5) %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -2721,7 +2721,7 @@ define void @local_8xi16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xi16_param_0];
 ; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -2754,11 +2754,11 @@ define void @local_2xi32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi32_param_0];
-; CHECK-NEXT:    ld.local.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi32_param_0];
+; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.local.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i32>, ptr addrspace(5) %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -2773,13 +2773,13 @@ define void @local_4xi32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xi32_param_0];
-; CHECK-NEXT:    ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi32_param_0];
+; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x i32>, ptr addrspace(5) %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -2793,11 +2793,11 @@ define void @local_2xi64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi64_param_0];
-; CHECK-NEXT:    ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi64_param_0];
+; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.local.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x i64>, ptr addrspace(5) %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -2812,11 +2812,11 @@ define void @local_2xfloat(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xfloat_param_0];
-; CHECK-NEXT:    ld.local.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xfloat_param_0];
+; CHECK-NEXT:    ld.local.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x float>, ptr addrspace(5) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -2831,13 +2831,13 @@ define void @local_4xfloat(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xfloat_param_0];
-; CHECK-NEXT:    ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xfloat_param_0];
+; CHECK-NEXT:    ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load <4 x float>, ptr addrspace(5) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -2852,11 +2852,11 @@ define void @local_2xdouble(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xdouble_param_0];
-; CHECK-NEXT:    ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xdouble_param_0];
+; CHECK-NEXT:    ld.local.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load <2 x double>, ptr addrspace(5) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
@@ -2873,11 +2873,11 @@ define void @local_volatile_2xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi8_param_0];
-; CHECK-NEXT:    ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi8_param_0];
+; CHECK-NEXT:    ld.local.v2.b8 {%rs1, %rs2}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.local.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    st.local.v2.b8 [%rd1], {%rs4, %rs3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i8>, ptr addrspace(5) %a
   %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -2893,8 +2893,8 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xi8_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi8_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
@@ -2914,7 +2914,7 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
 ; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.local.u32 [%rd1], %r12;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r12;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i8>, ptr addrspace(5) %a
   %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -2930,7 +2930,7 @@ define void @local_volatile_8xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_8xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xi8_param_0];
 ; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
@@ -2986,7 +2986,7 @@ define void @local_volatile_16xi8(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_16xi8_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_16xi8_param_0];
 ; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
@@ -3080,13 +3080,13 @@ define void @local_volatile_2xi16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi16_param_0];
-; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi16_param_0];
+; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
 ; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
+; CHECK-NEXT:    st.local.b32 [%rd1], %r2;
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i16>, ptr addrspace(5) %a
   %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -3101,13 +3101,13 @@ define void @local_volatile_4xi16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xi16_param_0];
-; CHECK-NEXT:    ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi16_param_0];
+; CHECK-NEXT:    ld.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
 ; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
 ; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
 ; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
 ; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    st.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i16>, ptr addrspace(5) %a
   %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -3123,7 +3123,7 @@ define void @local_volatile_8xi16(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_8xi16_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xi16_param_0];
 ; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
@@ -3156,11 +3156,11 @@ define void @local_volatile_2xi32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi32_param_0];
-; CHECK-NEXT:    ld.local.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi32_param_0];
+; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r3, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.local.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i32>, ptr addrspace(5) %a
   %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
@@ -3175,13 +3175,13 @@ define void @local_volatile_4xi32(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xi32_param_0];
-; CHECK-NEXT:    ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi32_param_0];
+; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; CHECK-NEXT:    add.s32 %r5, %r4, 1;
 ; CHECK-NEXT:    add.s32 %r6, %r3, 1;
 ; CHECK-NEXT:    add.s32 %r7, %r2, 1;
 ; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x i32>, ptr addrspace(5) %a
   %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
@@ -3195,11 +3195,11 @@ define void @local_volatile_2xi64(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi64_param_0];
-; CHECK-NEXT:    ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi64_param_0];
+; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
 ; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
 ; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.local.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x i64>, ptr addrspace(5) %a
   %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
@@ -3214,11 +3214,11 @@ define void @local_volatile_2xfloat(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.local.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xfloat_param_0];
+; CHECK-NEXT:    ld.local.v2.b32 {%f1, %f2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%f4, %f3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x float>, ptr addrspace(5) %a
   %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
@@ -3233,13 +3233,13 @@ define void @local_volatile_4xfloat(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xfloat_param_0];
+; CHECK-NEXT:    ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%f8, %f7, %f6, %f5};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <4 x float>, ptr addrspace(5) %a
   %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
@@ -3254,11 +3254,11 @@ define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xdouble_param_0];
+; CHECK-NEXT:    ld.local.v2.b64 {%fd1, %fd2}, [%rd1];
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.local.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%fd4, %fd3};
 ; CHECK-NEXT:    ret;
   %a.load = load volatile <2 x double>, ptr addrspace(5) %a
   %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index 6a34135a31783..4d7a4b50e8940 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -7,9 +7,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "nvptx64-unknown-unknown"
 
 ; SM20-LABEL: .visible .entry foo1(
-; SM20: ld.global.f32
+; SM20: ld.global.b32
 ; SM35-LABEL: .visible .entry foo1(
-; SM35: ld.global.nc.f32
+; SM35: ld.global.nc.b32
 define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
   %1 = load float, ptr %from
   store float %1, ptr %to
@@ -17,9 +17,9 @@ define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo2(
-; SM20: ld.global.f64
+; SM20: ld.global.b64
 ; SM35-LABEL: .visible .entry foo2(
-; SM35: ld.global.nc.f64
+; SM35: ld.global.nc.b64
 define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
   %1 = load double, ptr %from
   store double %1, ptr %to
@@ -27,9 +27,9 @@ define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo3(
-; SM20: ld.global.u16
+; SM20: ld.global.b16
 ; SM35-LABEL: .visible .entry foo3(
-; SM35: ld.global.nc.u16
+; SM35: ld.global.nc.b16
 define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
   %1 = load i16, ptr %from
   store i16 %1, ptr %to
@@ -37,9 +37,9 @@ define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo4(
-; SM20: ld.global.u32
+; SM20: ld.global.b32
 ; SM35-LABEL: .visible .entry foo4(
-; SM35: ld.global.nc.u32
+; SM35: ld.global.nc.b32
 define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
   %1 = load i32, ptr %from
   store i32 %1, ptr %to
@@ -47,9 +47,9 @@ define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo5(
-; SM20: ld.global.u64
+; SM20: ld.global.b64
 ; SM35-LABEL: .visible .entry foo5(
-; SM35: ld.global.nc.u64
+; SM35: ld.global.nc.b64
 define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
   %1 = load i64, ptr %from
   store i64 %1, ptr %to
@@ -58,9 +58,9 @@ define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
 
 ; i128 is non standard integer in nvptx64
 ; SM20-LABEL: .visible .entry foo6(
-; SM20: ld.global.v2.u64
+; SM20: ld.global.v2.b64
 ; SM35-LABEL: .visible .entry foo6(
-; SM35: ld.global.nc.v2.u64
+; SM35: ld.global.nc.v2.b64
 define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
   %1 = load i128, ptr %from
   store i128 %1, ptr %to
@@ -68,9 +68,9 @@ define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo7(
-; SM20: ld.global.v2.u8
+; SM20: ld.global.v2.b8
 ; SM35-LABEL: .visible .entry foo7(
-; SM35: ld.global.nc.v2.u8
+; SM35: ld.global.nc.v2.b8
 define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i8>, ptr %from
   store <2 x i8> %1, ptr %to
@@ -78,9 +78,9 @@ define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo8(
-; SM20: ld.global.u32
+; SM20: ld.global.b32
 ; SM35-LABEL: .visible .entry foo8(
-; SM35: ld.global.nc.u32
+; SM35: ld.global.nc.b32
 define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i16>, ptr %from
   store <2 x i16> %1, ptr %to
@@ -88,9 +88,9 @@ define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo9(
-; SM20: ld.global.v2.u32
+; SM20: ld.global.v2.b32
 ; SM35-LABEL: .visible .entry foo9(
-; SM35: ld.global.nc.v2.u32
+; SM35: ld.global.nc.v2.b32
 define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i32>, ptr %from
   store <2 x i32> %1, ptr %to
@@ -98,9 +98,9 @@ define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo10(
-; SM20: ld.global.v2.u64
+; SM20: ld.global.v2.b64
 ; SM35-LABEL: .visible .entry foo10(
-; SM35: ld.global.nc.v2.u64
+; SM35: ld.global.nc.v2.b64
 define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i64>, ptr %from
   store <2 x i64> %1, ptr %to
@@ -108,9 +108,9 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo11(
-; SM20: ld.global.v2.f32
+; SM20: ld.global.v2.b32
 ; SM35-LABEL: .visible .entry foo11(
-; SM35: ld.global.nc.v2.f32
+; SM35: ld.global.nc.v2.b32
 define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x float>, ptr %from
   store <2 x float> %1, ptr %to
@@ -118,9 +118,9 @@ define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo12(
-; SM20: ld.global.v2.f64
+; SM20: ld.global.v2.b64
 ; SM35-LABEL: .visible .entry foo12(
-; SM35: ld.global.nc.v2.f64
+; SM35: ld.global.nc.v2.b64
 define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x double>, ptr %from
   store <2 x double> %1, ptr %to
@@ -128,9 +128,9 @@ define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo13(
-; SM20: ld.global.u32
+; SM20: ld.global.b32
 ; SM35-LABEL: .visible .entry foo13(
-; SM35: ld.global.nc.u32
+; SM35: ld.global.nc.b32
 define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i8>, ptr %from
   store <4 x i8> %1, ptr %to
@@ -138,9 +138,9 @@ define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo14(
-; SM20: ld.global.v4.u16
+; SM20: ld.global.v4.b16
 ; SM35-LABEL: .visible .entry foo14(
-; SM35: ld.global.nc.v4.u16
+; SM35: ld.global.nc.v4.b16
 define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i16>, ptr %from
   store <4 x i16> %1, ptr %to
@@ -148,9 +148,9 @@ define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo15(
-; SM20: ld.global.v4.u32
+; SM20: ld.global.v4.b32
 ; SM35-LABEL: .visible .entry foo15(
-; SM35: ld.global.nc.v4.u32
+; SM35: ld.global.nc.v4.b32
 define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x i32>, ptr %from
   store <4 x i32> %1, ptr %to
@@ -158,9 +158,9 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo16(
-; SM20: ld.global.v4.f32
+; SM20: ld.global.v4.b32
 ; SM35-LABEL: .visible .entry foo16(
-; SM35: ld.global.nc.v4.f32
+; SM35: ld.global.nc.v4.b32
 define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x float>, ptr %from
   store <4 x float> %1, ptr %to
@@ -168,11 +168,11 @@ define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo17(
-; SM20: ld.global.v2.f64
-; SM20: ld.global.v2.f64
+; SM20: ld.global.v2.b64
+; SM20: ld.global.v2.b64
 ; SM35-LABEL: .visible .entry foo17(
-; SM35: ld.global.nc.v2.f64
-; SM35: ld.global.nc.v2.f64
+; SM35: ld.global.nc.v2.b64
+; SM35: ld.global.nc.v2.b64
 define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x double>, ptr %from
   store <4 x double> %1, ptr %to
@@ -180,9 +180,9 @@ define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo18(
-; SM20: ld.global.u64
+; SM20: ld.global.b64
 ; SM35-LABEL: .visible .entry foo18(
-; SM35: ld.global.nc.u64
+; SM35: ld.global.nc.b64
 define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
   %1 = load ptr, ptr %from
   store ptr %1, ptr %to
@@ -191,9 +191,9 @@ define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
 
 ; Test that we can infer a cached load for a pointer induction variable.
 ; SM20-LABEL: .visible .entry foo19(
-; SM20: ld.global.f32
+; SM20: ld.global.b32
 ; SM35-LABEL: .visible .entry foo19(
-; SM35: ld.global.nc.f32
+; SM35: ld.global.nc.b32
 define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
 entry:
   br label %loop
@@ -219,9 +219,9 @@ exit:
 ; pointed-to memory is never written to (for the duration of the
 ; kernel). For both reasons, we cannot use a cached load here.
 ; SM20-LABEL: notkernel(
-; SM20: ld.f32
+; SM20: ld.b32
 ; SM35-LABEL: notkernel(
-; SM35: ld.f32
+; SM35: ld.b32
 define void @notkernel(ptr noalias readonly %from, ptr %to) {
   %1 = load float, ptr %from
   store float %1, ptr %to
@@ -233,9 +233,9 @@ define void @notkernel(ptr noalias readonly %from, ptr %to) {
 ; kernel). This case does not currently come up normally since we do not infer
 ; that pointers are global interprocedurally as of 2015-08-05.
 ; SM20-LABEL: notkernel2(
-; SM20: ld.global.f32
+; SM20: ld.global.b32
 ; SM35-LABEL: notkernel2(
-; SM35: ld.global.f32
+; SM35: ld.global.b32
 define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
   %1 = load float, ptr addrspace(1) %from
   store float %1, ptr %to
diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
index f49053485fa29..2bfd891a04a17 100644
--- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -16,9 +16,9 @@ define void @foo(i32 %a) {
 ; PTX32-EMPTY:
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot0;
-; PTX32-NEXT:    ld.param.u32 %r1, [foo_param_0];
+; PTX32-NEXT:    ld.param.b32 %r1, [foo_param_0];
 ; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
-; PTX32-NEXT:    st.local.u32 [%r3], %r1;
+; PTX32-NEXT:    st.local.b32 [%r3], %r1;
 ; PTX32-NEXT:    ret;
 ;
 ; PTX64-LABEL: foo(
@@ -31,9 +31,9 @@ define void @foo(i32 %a) {
 ; PTX64-EMPTY:
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot0;
-; PTX64-NEXT:    ld.param.u32 %r1, [foo_param_0];
+; PTX64-NEXT:    ld.param.b32 %r1, [foo_param_0];
 ; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT:    st.local.u32 [%rd2], %r1;
+; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
 ; PTX64-NEXT:    ret;
   %local = alloca i32, align 4
   store volatile i32 %a, ptr %local
@@ -51,10 +51,10 @@ define ptx_kernel void @foo2(i32 %a) {
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot1;
 ; PTX32-NEXT:    cvta.local.u32 %SP, %SPL;
-; PTX32-NEXT:    ld.param.u32 %r1, [foo2_param_0];
+; PTX32-NEXT:    ld.param.b32 %r1, [foo2_param_0];
 ; PTX32-NEXT:    add.u32 %r2, %SP, 0;
 ; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
-; PTX32-NEXT:    st.local.u32 [%r3], %r1;
+; PTX32-NEXT:    st.local.b32 [%r3], %r1;
 ; PTX32-NEXT:    { // callseq 0, 0
 ; PTX32-NEXT:    .param .b32 param0;
 ; PTX32-NEXT:    st.param.b32 [param0], %r2;
@@ -77,10 +77,10 @@ define ptx_kernel void @foo2(i32 %a) {
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot1;
 ; PTX64-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX64-NEXT:    ld.param.u32 %r1, [foo2_param_0];
+; PTX64-NEXT:    ld.param.b32 %r1, [foo2_param_0];
 ; PTX64-NEXT:    add.u64 %rd1, %SP, 0;
 ; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX64-NEXT:    st.local.u32 [%rd2], %r1;
+; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
 ; PTX64-NEXT:    { // callseq 0, 0
 ; PTX64-NEXT:    .param .b64 param0;
 ; PTX64-NEXT:    st.param.b64 [param0], %rd1;
@@ -109,11 +109,11 @@ define void @foo3(i32 %a) {
 ; PTX32-EMPTY:
 ; PTX32-NEXT:  // %bb.0:
 ; PTX32-NEXT:    mov.b32 %SPL, __local_depot2;
-; PTX32-NEXT:    ld.param.u32 %r1, [foo3_param_0];
+; PTX32-NEXT:    ld.param.b32 %r1, [foo3_param_0];
 ; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
 ; PTX32-NEXT:    shl.b32 %r4, %r1, 2;
 ; PTX32-NEXT:    add.s32 %r5, %r3, %r4;
-; PTX32-NEXT:    st.local.u32 [%r5], %r1;
+; PTX32-NEXT:    st.local.b32 [%r5], %r1;
 ; PTX32-NEXT:    ret;
 ;
 ; PTX64-LABEL: foo3(
@@ -126,11 +126,11 @@ define void @foo3(i32 %a) {
 ; PTX64-EMPTY:
 ; PTX64-NEXT:  // %bb.0:
 ; PTX64-NEXT:    mov.b64 %SPL, __local_depot2;
-; PTX64-NEXT:    ld.param.u32 %r1, [foo3_param_0];
+; PTX64-NEXT:    ld.param.b32 %r1, [foo3_param_0];
 ; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
 ; PTX64-NEXT:    mul.wide.s32 %rd3, %r1, 4;
 ; PTX64-NEXT:    add.s64 %rd4, %rd2, %rd3;
-; PTX64-NEXT:    st.local.u32 [%rd4], %r1;
+; PTX64-NEXT:    st.local.b32 [%rd4], %r1;
 ; PTX64-NEXT:    ret;
   %local = alloca [3 x i32], align 4
   %1 = getelementptr inbounds i32, ptr %local, i32 %a
@@ -154,8 +154,8 @@ define void @foo4() {
 ; PTX32-NEXT:    add.u32 %r3, %SP, 4;
 ; PTX32-NEXT:    add.u32 %r4, %SPL, 4;
 ; PTX32-NEXT:    mov.b32 %r5, 0;
-; PTX32-NEXT:    st.local.u32 [%r2], %r5;
-; PTX32-NEXT:    st.local.u32 [%r4], %r5;
+; PTX32-NEXT:    st.local.b32 [%r2], %r5;
+; PTX32-NEXT:    st.local.b32 [%r4], %r5;
 ; PTX32-NEXT:    { // callseq 1, 0
 ; PTX32-NEXT:    .param .b32 param0;
 ; PTX32-NEXT:    st.param.b32 [param0], %r1;
@@ -192,8 +192,8 @@ define void @foo4() {
 ; PTX64-NEXT:    add.u64 %rd3, %SP, 4;
 ; PTX64-NEXT:    add.u64 %rd4, %SPL, 4;
 ; PTX64-NEXT:    mov.b32 %r1, 0;
-; PTX64-NEXT:    st.local.u32 [%rd2], %r1;
-; PTX64-NEXT:    st.local.u32 [%rd4], %r1;
+; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
+; PTX64-NEXT:    st.local.b32 [%rd4], %r1;
 ; PTX64-NEXT:    { // callseq 1, 0
 ; PTX64-NEXT:    .param .b64 param0;
 ; PTX64-NEXT:    st.param.b64 [param0], %rd1;
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index 720c95b51358c..99212fc0dff79 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -37,8 +37,8 @@ entry:
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memcpy_caller
 ; PTX:        $L__BB[[LABEL:[_0-9]+]]:
-; PTX:        ld.u8 %rs[[REG:[0-9]+]]
-; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
+; PTX:        ld.b8 %rs[[REG:[0-9]+]]
+; PTX:        st.b8 [%rd{{[0-9]+}}], %rs[[REG]]
 ; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX:        @%p[[PRED]] bra $L__BB[[LABEL]]
@@ -71,8 +71,8 @@ entry:
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memcpy_volatile_caller
 ; PTX:        $L__BB[[LABEL:[_0-9]+]]:
-; PTX:        ld.volatile.u8 %rs[[REG:[0-9]+]]
-; PTX:        st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]]
+; PTX:        ld.volatile.b8 %rs[[REG:[0-9]+]]
+; PTX:        st.volatile.b8 [%rd{{[0-9]+}}], %rs[[REG]]
 ; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX:        @%p[[PRED]] bra $L__BB[[LABEL]]
@@ -124,10 +124,10 @@ entry:
 ; IR-NEXT:    store i8 [[VAL]], ptr [[STOREPTR]]
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memset_caller(
-; PTX:        ld.param.u32 %r[[C:[0-9]+]]
+; PTX:        ld.param.b32 %r[[C:[0-9]+]]
 ; PTX:        cvt.u16.u32  %rs[[REG:[0-9]+]], %r[[C]];
 ; PTX:        $L__BB[[LABEL:[_0-9]+]]:
-; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
+; PTX:        st.b8 [%rd{{[0-9]+}}], %rs[[REG]]
 ; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX:        @%p[[PRED]] bra $L__BB[[LABEL]]
@@ -159,20 +159,20 @@ entry:
 ; IR:         {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memmove_caller(
-; PTX:        ld.param.u64 %rd[[N:[0-9]+]]
+; PTX:        ld.param.b64 %rd[[N:[0-9]+]]
 ; PTX-DAG:    setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0
 ; PTX-DAG:    setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; PTX-NEXT:   @%p[[SRC_GT_THAN_DST]] bra $L__BB[[FORWARD_BB:[0-9_]+]]
 ; -- this is the backwards copying BB
 ; PTX:        @%p[[NEQ0]] bra $L__BB[[EXIT:[0-9_]+]]
 ; PTX:        add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1
-; PTX:        ld.u8 %rs[[ELEMENT:[0-9]+]]
-; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
+; PTX:        ld.b8 %rs[[ELEMENT:[0-9]+]]
+; PTX:        st.b8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
 ; -- this is the forwards copying BB
 ; PTX:        $L__BB[[FORWARD_BB]]:
 ; PTX:        @%p[[NEQ0]] bra $L__BB[[EXIT]]
-; PTX:        ld.u8 %rs[[ELEMENT2:[0-9]+]]
-; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
+; PTX:        ld.b8 %rs[[ELEMENT2:[0-9]+]]
+; PTX:        st.b8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
 ; PTX:        add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1
 ; -- exit block
 ; PTX:        $L__BB[[EXIT]]:
diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
index 530b48b3d3e37..489bcf4a7d55c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll
@@ -15,7 +15,7 @@ define ptx_kernel void @kernel() {
 ; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr %A to ptr addrspace(5)
 ; LOWERALLOCAONLY: [[V2:%.*]] = addrspacecast ptr addrspace(5) [[V1]] to ptr
 ; LOWERALLOCAONLY: store i32 0, ptr [[V2]], align 4
-; PTX: st.local.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+; PTX: st.local.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
   store i32 0, ptr %A
   call void @callee(ptr %A)
   ret void
@@ -26,7 +26,7 @@ define void @alloca_in_explicit_local_as() {
 ; PTX-LABEL: .visible .func alloca_in_explicit_local_as(
   %A = alloca i32, addrspace(5)
 ; CHECK: store i32 0, ptr addrspace(5) {{%.+}}
-; PTX: st.local.u32 [%SP], {{%r[0-9]+}}
+; PTX: st.local.b32 [%SP], {{%r[0-9]+}}
 ; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr addrspace(5) %A to ptr
 ; LOWERALLOCAONLY: store i32 0, ptr [[V1]], align 4
   store i32 0, ptr addrspace(5) %A
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index dd172cf685380..c3f94455b3038 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -28,7 +28,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %rd1, non_kernel_function_param_0;
 ; PTX-NEXT:    cvta.local.u64 %rd2, %rd1;
-; PTX-NEXT:    ld.param.u8 %rs1, [non_kernel_function_param_1];
+; PTX-NEXT:    ld.param.b8 %rs1, [non_kernel_function_param_1];
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; PTX-NEXT:    mov.b64 %rd3, gi;
@@ -36,13 +36,13 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
 ; PTX-NEXT:    selp.b64 %rd5, %rd2, %rd4, %p1;
 ; PTX-NEXT:    ld.param.s32 %rd6, [non_kernel_function_param_2];
 ; PTX-NEXT:    add.s64 %rd7, %rd5, %rd6;
-; PTX-NEXT:    ld.u8 %r1, [%rd7];
-; PTX-NEXT:    ld.u8 %r2, [%rd7+1];
+; PTX-NEXT:    ld.b8 %r1, [%rd7];
+; PTX-NEXT:    ld.b8 %r2, [%rd7+1];
 ; PTX-NEXT:    shl.b32 %r3, %r2, 8;
 ; PTX-NEXT:    or.b32 %r4, %r3, %r1;
-; PTX-NEXT:    ld.u8 %r5, [%rd7+2];
+; PTX-NEXT:    ld.b8 %r5, [%rd7+2];
 ; PTX-NEXT:    shl.b32 %r6, %r5, 16;
-; PTX-NEXT:    ld.u8 %r7, [%rd7+3];
+; PTX-NEXT:    ld.b8 %r7, [%rd7+3];
 ; PTX-NEXT:    shl.b32 %r8, %r7, 24;
 ; PTX-NEXT:    or.b32 %r9, %r8, %r6;
 ; PTX-NEXT:    or.b32 %r10, %r9, %r4;
@@ -63,12 +63,12 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT:    ld.param.b64 %rd1, [grid_const_int_param_2];
 ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTX-NEXT:    ld.param.u32 %r1, [grid_const_int_param_1];
-; PTX-NEXT:    ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT:    ld.param.b32 %r1, [grid_const_int_param_1];
+; PTX-NEXT:    ld.param.b32 %r2, [grid_const_int_param_0];
 ; PTX-NEXT:    add.s32 %r3, %r2, %r1;
-; PTX-NEXT:    st.global.u32 [%rd2], %r3;
+; PTX-NEXT:    st.global.b32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_int(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -92,12 +92,12 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_struct_param_1];
+; PTX-NEXT:    ld.param.b64 %rd1, [grid_const_struct_param_1];
 ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTX-NEXT:    ld.param.u32 %r1, [grid_const_struct_param_0];
-; PTX-NEXT:    ld.param.u32 %r2, [grid_const_struct_param_0+4];
+; PTX-NEXT:    ld.param.b32 %r1, [grid_const_struct_param_0];
+; PTX-NEXT:    ld.param.b32 %r2, [grid_const_struct_param_0+4];
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
-; PTX-NEXT:    st.global.u32 [%rd2], %r3;
+; PTX-NEXT:    st.global.b32 [%rd2], %r3;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_struct(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
@@ -165,13 +165,13 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot4;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; PTX-NEXT:    mov.b64 %rd2, multiple_grid_const_escape_param_0;
-; PTX-NEXT:    ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
+; PTX-NEXT:    ld.param.b32 %r1, [multiple_grid_const_escape_param_1];
 ; PTX-NEXT:    mov.b64 %rd3, multiple_grid_const_escape_param_2;
 ; PTX-NEXT:    cvta.param.u64 %rd4, %rd3;
 ; PTX-NEXT:    cvta.param.u64 %rd5, %rd2;
 ; PTX-NEXT:    add.u64 %rd6, %SP, 0;
 ; PTX-NEXT:    add.u64 %rd7, %SPL, 0;
-; PTX-NEXT:    st.local.u32 [%rd7], %r1;
+; PTX-NEXT:    st.local.b32 [%rd7], %r1;
 ; PTX-NEXT:    mov.b64 %rd1, escape3;
 ; PTX-NEXT:    { // callseq 1, 0
 ; PTX-NEXT:    .param .b64 param0;
@@ -216,10 +216,10 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd1, grid_const_memory_escape_param_0;
-; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
+; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_memory_escape_param_1];
 ; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
 ; PTX-NEXT:    cvta.param.u64 %rd4, %rd1;
-; PTX-NEXT:    st.global.u64 [%rd3], %rd4;
+; PTX-NEXT:    st.global.b64 [%rd3], %rd4;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
@@ -238,14 +238,14 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
-; PTX-NEXT:    ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
+; PTX-NEXT:    ld.param.b64 %rd5, [grid_const_inlineasm_escape_param_1];
 ; PTX-NEXT:    cvta.to.global.u64 %rd6, %rd5;
 ; PTX-NEXT:    cvta.param.u64 %rd2, %rd4;
 ; PTX-NEXT:    add.s64 %rd3, %rd2, 4;
 ; PTX-NEXT:    // begin inline asm
 ; PTX-NEXT:    add.s64 %rd1, %rd2, %rd3;
 ; PTX-NEXT:    // end inline asm
-; PTX-NEXT:    st.global.u64 [%rd6], %rd1;
+; PTX-NEXT:    st.global.b64 [%rd6], %rd1;
 ; PTX-NEXT:    ret;
 ; PTX-NOT      .local
 ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
@@ -272,12 +272,12 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escape_param_0;
-; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escape_param_1];
+; PTX-NEXT:    ld.param.b64 %rd3, [grid_const_partial_escape_param_1];
 ; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
 ; PTX-NEXT:    cvta.param.u64 %rd5, %rd2;
-; PTX-NEXT:    ld.param.u32 %r1, [grid_const_partial_escape_param_0];
+; PTX-NEXT:    ld.param.b32 %r1, [grid_const_partial_escape_param_0];
 ; PTX-NEXT:    add.s32 %r2, %r1, %r1;
-; PTX-NEXT:    st.global.u32 [%rd4], %r2;
+; PTX-NEXT:    st.global.b32 [%rd4], %r2;
 ; PTX-NEXT:    mov.b64 %rd1, escape;
 ; PTX-NEXT:    { // callseq 2, 0
 ; PTX-NEXT:    .param .b64 param0;
@@ -317,12 +317,12 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input,
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd2, grid_const_partial_escapemem_param_0;
-; PTX-NEXT:    ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1];
+; PTX-NEXT:    ld.param.b64 %rd3, [grid_const_partial_escapemem_param_1];
 ; PTX-NEXT:    cvta.to.global.u64 %rd4, %rd3;
 ; PTX-NEXT:    cvta.param.u64 %rd5, %rd2;
-; PTX-NEXT:    ld.param.u32 %r1, [grid_const_partial_escapemem_param_0];
-; PTX-NEXT:    ld.param.u32 %r2, [grid_const_partial_escapemem_param_0+4];
-; PTX-NEXT:    st.global.u64 [%rd4], %rd5;
+; PTX-NEXT:    ld.param.b32 %r1, [grid_const_partial_escapemem_param_0];
+; PTX-NEXT:    ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4];
+; PTX-NEXT:    st.global.b64 [%rd4], %rd5;
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
 ; PTX-NEXT:    mov.b64 %rd1, escape;
 ; PTX-NEXT:    { // callseq 3, 0
@@ -371,16 +371,16 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd6, grid_const_phi_param_0;
-; PTX-NEXT:    ld.param.u64 %rd5, [grid_const_phi_param_1];
+; PTX-NEXT:    ld.param.b64 %rd5, [grid_const_phi_param_1];
 ; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd5;
-; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
+; PTX-NEXT:    ld.global.b32 %r1, [%rd1];
 ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
 ; PTX-NEXT:    @%p1 bra $L__BB9_2;
 ; PTX-NEXT:  // %bb.1: // %second
 ; PTX-NEXT:    add.s64 %rd6, %rd6, 4;
 ; PTX-NEXT:  $L__BB9_2: // %merge
-; PTX-NEXT:    ld.param.u32 %r2, [%rd6];
-; PTX-NEXT:    st.global.u32 [%rd1], %r2;
+; PTX-NEXT:    ld.param.b32 %r2, [%rd6];
+; PTX-NEXT:    st.global.b32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_phi(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
@@ -427,17 +427,17 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd7, grid_const_phi_ngc_param_0;
-; PTX-NEXT:    ld.param.u64 %rd6, [grid_const_phi_ngc_param_2];
+; PTX-NEXT:    ld.param.b64 %rd6, [grid_const_phi_ngc_param_2];
 ; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd6;
-; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
+; PTX-NEXT:    ld.global.b32 %r1, [%rd1];
 ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
 ; PTX-NEXT:    @%p1 bra $L__BB10_2;
 ; PTX-NEXT:  // %bb.1: // %second
 ; PTX-NEXT:    mov.b64 %rd2, grid_const_phi_ngc_param_1;
 ; PTX-NEXT:    add.s64 %rd7, %rd2, 4;
 ; PTX-NEXT:  $L__BB10_2: // %merge
-; PTX-NEXT:    ld.param.u32 %r2, [%rd7];
-; PTX-NEXT:    st.global.u32 [%rd1], %r2;
+; PTX-NEXT:    ld.param.b32 %r2, [%rd7];
+; PTX-NEXT:    st.global.b32 [%rd1], %r2;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
@@ -485,14 +485,14 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd1, grid_const_select_param_0;
-; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_select_param_2];
+; PTX-NEXT:    ld.param.b64 %rd2, [grid_const_select_param_2];
 ; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
 ; PTX-NEXT:    mov.b64 %rd4, grid_const_select_param_1;
-; PTX-NEXT:    ld.global.u32 %r1, [%rd3];
+; PTX-NEXT:    ld.global.b32 %r1, [%rd3];
 ; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
 ; PTX-NEXT:    selp.b64 %rd5, %rd1, %rd4, %p1;
-; PTX-NEXT:    ld.param.u32 %r2, [%rd5];
-; PTX-NEXT:    st.global.u32 [%rd3], %r2;
+; PTX-NEXT:    ld.param.b32 %r2, [%rd5];
+; PTX-NEXT:    st.global.b32 [%rd3], %r2;
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_select(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
@@ -522,7 +522,7 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd1, grid_const_ptrtoint_param_0;
-; PTX-NEXT:    ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
+; PTX-NEXT:    ld.param.b32 %r1, [grid_const_ptrtoint_param_0];
 ; PTX-NEXT:    cvta.param.u64 %rd2, %rd1;
 ; PTX-NEXT:    cvt.u32.u64 %r2, %rd2;
 ; PTX-NEXT:    add.s32 %r3, %r1, %r2;
@@ -557,7 +557,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
 ; PTX-NEXT:    .reg .b32 %r<2>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.param.u32 %r1, [test_forward_byval_arg_param_0];
+; PTX-NEXT:    ld.param.b32 %r1, [test_forward_byval_arg_param_0];
 ; PTX-NEXT:    { // callseq 4, 0
 ; PTX-NEXT:    .param .align 4 .b8 param0[4];
 ; PTX-NEXT:    st.param.b32 [param0], %r1;
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 8e879871e295b..246408ecf6a3a 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -35,14 +35,14 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %rd1, load_alignment_param_0;
-; PTX-NEXT:    ld.local.u64 %rd2, [%rd1];
-; PTX-NEXT:    ld.local.u64 %rd3, [%rd1+8];
+; PTX-NEXT:    ld.local.b64 %rd2, [%rd1];
+; PTX-NEXT:    ld.local.b64 %rd3, [%rd1+8];
 ; PTX-NEXT:    add.s64 %rd4, %rd1, 16;
 ; PTX-NEXT:    cvta.local.u64 %rd5, %rd4;
-; PTX-NEXT:    ld.local.u32 %r1, [%rd1+16];
-; PTX-NEXT:    ld.u32 %r2, [%rd2];
+; PTX-NEXT:    ld.local.b32 %r1, [%rd1+16];
+; PTX-NEXT:    ld.b32 %r2, [%rd2];
 ; PTX-NEXT:    add.s32 %r3, %r2, %r1;
-; PTX-NEXT:    st.u32 [%rd3], %r3;
+; PTX-NEXT:    st.b32 [%rd3], %r3;
 ; PTX-NEXT:    { // callseq 0, 0
 ; PTX-NEXT:    .param .b64 param0;
 ; PTX-NEXT:    st.param.b64 [param0], %rd5;
@@ -116,12 +116,12 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
 ; PTXC-NEXT:    .reg .b64 %rd<5>;
 ; PTXC-EMPTY:
 ; PTXC-NEXT:  // %bb.0:
-; PTXC-NEXT:    ld.param.u64 %rd1, [ptr_generic_param_0];
-; PTXC-NEXT:    ld.param.u64 %rd2, [ptr_generic_param_1];
+; PTXC-NEXT:    ld.param.b64 %rd1, [ptr_generic_param_0];
+; PTXC-NEXT:    ld.param.b64 %rd2, [ptr_generic_param_1];
 ; PTXC-NEXT:    cvta.to.global.u64 %rd3, %rd2;
 ; PTXC-NEXT:    cvta.to.global.u64 %rd4, %rd1;
-; PTXC-NEXT:    ld.global.u32 %r1, [%rd3];
-; PTXC-NEXT:    st.global.u32 [%rd4], %r1;
+; PTXC-NEXT:    ld.global.b32 %r1, [%rd3];
+; PTXC-NEXT:    st.global.b32 [%rd4], %r1;
 ; PTXC-NEXT:    ret;
 ;
 ; PTXO-LABEL: ptr_generic(
@@ -130,10 +130,10 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
 ; PTXO-NEXT:    .reg .b64 %rd<3>;
 ; PTXO-EMPTY:
 ; PTXO-NEXT:  // %bb.0:
-; PTXO-NEXT:    ld.param.u64 %rd1, [ptr_generic_param_0];
-; PTXO-NEXT:    ld.param.u64 %rd2, [ptr_generic_param_1];
-; PTXO-NEXT:    ld.u32 %r1, [%rd2];
-; PTXO-NEXT:    st.u32 [%rd1], %r1;
+; PTXO-NEXT:    ld.param.b64 %rd1, [ptr_generic_param_0];
+; PTXO-NEXT:    ld.param.b64 %rd2, [ptr_generic_param_1];
+; PTXO-NEXT:    ld.b32 %r1, [%rd2];
+; PTXO-NEXT:    st.b32 [%rd1], %r1;
 ; PTXO-NEXT:    ret;
   %v = load i32, ptr  %in, align 4
   store i32 %v, ptr %out, align 4
@@ -153,10 +153,10 @@ define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.param.u64 %rd1, [ptr_nongeneric_param_0];
-; PTX-NEXT:    ld.param.u64 %rd2, [ptr_nongeneric_param_1];
-; PTX-NEXT:    ld.shared.u32 %r1, [%rd2];
-; PTX-NEXT:    st.global.u32 [%rd1], %r1;
+; PTX-NEXT:    ld.param.b64 %rd1, [ptr_nongeneric_param_0];
+; PTX-NEXT:    ld.param.b64 %rd2, [ptr_nongeneric_param_1];
+; PTX-NEXT:    ld.shared.b32 %r1, [%rd2];
+; PTX-NEXT:    st.global.b32 [%rd1], %r1;
 ; PTX-NEXT:    ret;
   %v = load i32, ptr addrspace(3) %in, align 4
   store i32 %v, ptr addrspace(1) %out, align 4
@@ -184,10 +184,10 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 ; PTXC-NEXT:    .reg .b64 %rd<3>;
 ; PTXC-EMPTY:
 ; PTXC-NEXT:  // %bb.0:
-; PTXC-NEXT:    ld.param.u64 %rd1, [ptr_as_int_param_0];
-; PTXC-NEXT:    ld.param.u32 %r1, [ptr_as_int_param_1];
+; PTXC-NEXT:    ld.param.b64 %rd1, [ptr_as_int_param_0];
+; PTXC-NEXT:    ld.param.b32 %r1, [ptr_as_int_param_1];
 ; PTXC-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTXC-NEXT:    st.global.u32 [%rd2], %r1;
+; PTXC-NEXT:    st.global.b32 [%rd2], %r1;
 ; PTXC-NEXT:    ret;
 ;
 ; PTXO-LABEL: ptr_as_int(
@@ -196,9 +196,9 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 ; PTXO-NEXT:    .reg .b64 %rd<2>;
 ; PTXO-EMPTY:
 ; PTXO-NEXT:  // %bb.0:
-; PTXO-NEXT:    ld.param.u64 %rd1, [ptr_as_int_param_0];
-; PTXO-NEXT:    ld.param.u32 %r1, [ptr_as_int_param_1];
-; PTXO-NEXT:    st.u32 [%rd1], %r1;
+; PTXO-NEXT:    ld.param.b64 %rd1, [ptr_as_int_param_0];
+; PTXO-NEXT:    ld.param.b32 %r1, [ptr_as_int_param_1];
+; PTXO-NEXT:    st.b32 [%rd1], %r1;
 ; PTXO-NEXT:    ret;
   %p = inttoptr i64 %i to ptr
   store i32 %v, ptr %p, align 4
@@ -232,10 +232,10 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st
 ; PTXC-NEXT:    .reg .b64 %rd<3>;
 ; PTXC-EMPTY:
 ; PTXC-NEXT:  // %bb.0:
-; PTXC-NEXT:    ld.param.u32 %r1, [ptr_as_int_aggr_param_1];
-; PTXC-NEXT:    ld.param.u64 %rd1, [ptr_as_int_aggr_param_0];
+; PTXC-NEXT:    ld.param.b32 %r1, [ptr_as_int_aggr_param_1];
+; PTXC-NEXT:    ld.param.b64 %rd1, [ptr_as_int_aggr_param_0];
 ; PTXC-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTXC-NEXT:    st.global.u32 [%rd2], %r1;
+; PTXC-NEXT:    st.global.b32 [%rd2], %r1;
 ; PTXC-NEXT:    ret;
 ;
 ; PTXO-LABEL: ptr_as_int_aggr(
@@ -244,9 +244,9 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st
 ; PTXO-NEXT:    .reg .b64 %rd<2>;
 ; PTXO-EMPTY:
 ; PTXO-NEXT:  // %bb.0:
-; PTXO-NEXT:    ld.param.u32 %r1, [ptr_as_int_aggr_param_1];
-; PTXO-NEXT:    ld.param.u64 %rd1, [ptr_as_int_aggr_param_0];
-; PTXO-NEXT:    st.u32 [%rd1], %r1;
+; PTXO-NEXT:    ld.param.b32 %r1, [ptr_as_int_aggr_param_1];
+; PTXO-NEXT:    ld.param.b64 %rd1, [ptr_as_int_aggr_param_0];
+; PTXO-NEXT:    st.b32 [%rd1], %r1;
 ; PTXO-NEXT:    ret;
   %i = load i64, ptr %s, align 8
   %p = inttoptr i64 %i to ptr
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 1304ffe42c7b5..54495cf0d61f3 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -50,10 +50,10 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
-; PTX-NEXT:    ld.param.u64 %rd1, [read_only_param_0];
+; PTX-NEXT:    ld.param.b64 %rd1, [read_only_param_0];
 ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTX-NEXT:    ld.param.u32 %r1, [read_only_param_1];
-; PTX-NEXT:    st.global.u32 [%rd2], %r1;
+; PTX-NEXT:    ld.param.b32 %r1, [read_only_param_1];
+; PTX-NEXT:    st.global.b32 [%rd2], %r1;
 ; PTX-NEXT:    ret;
 entry:
   %i = load i32, ptr %s, align 4
@@ -86,10 +86,10 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
 ; PTX-NEXT:    .reg .b64 %rd<3>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
-; PTX-NEXT:    ld.param.u64 %rd1, [read_only_gep_param_0];
+; PTX-NEXT:    ld.param.b64 %rd1, [read_only_gep_param_0];
 ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTX-NEXT:    ld.param.u32 %r1, [read_only_gep_param_1+4];
-; PTX-NEXT:    st.global.u32 [%rd2], %r1;
+; PTX-NEXT:    ld.param.b32 %r1, [read_only_gep_param_1+4];
+; PTX-NEXT:    st.global.b32 [%rd2], %r1;
 ; PTX-NEXT:    ret;
 entry:
   %b = getelementptr inbounds nuw i8, ptr %s, i64 4
@@ -146,10 +146,10 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; PTX-NEXT:    add.u64 %rd1, %SP, 0;
 ; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [escape_ptr_param_1+4];
-; PTX-NEXT:    st.local.u32 [%rd2+4], %r1;
-; PTX-NEXT:    ld.param.u32 %r2, [escape_ptr_param_1];
-; PTX-NEXT:    st.local.u32 [%rd2], %r2;
+; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_param_1+4];
+; PTX-NEXT:    st.local.b32 [%rd2+4], %r1;
+; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_param_1];
+; PTX-NEXT:    st.local.b32 [%rd2], %r2;
 ; PTX-NEXT:    { // callseq 0, 0
 ; PTX-NEXT:    .param .b64 param0;
 ; PTX-NEXT:    st.param.b64 [param0], %rd1;
@@ -190,10 +190,10 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; PTX-NEXT:    add.u64 %rd1, %SP, 0;
 ; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [escape_ptr_gep_param_1+4];
-; PTX-NEXT:    st.local.u32 [%rd2+4], %r1;
-; PTX-NEXT:    ld.param.u32 %r2, [escape_ptr_gep_param_1];
-; PTX-NEXT:    st.local.u32 [%rd2], %r2;
+; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_gep_param_1+4];
+; PTX-NEXT:    st.local.b32 [%rd2+4], %r1;
+; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_gep_param_1];
+; PTX-NEXT:    st.local.b32 [%rd2], %r2;
 ; PTX-NEXT:    add.s64 %rd3, %rd1, 4;
 ; PTX-NEXT:    { // callseq 1, 0
 ; PTX-NEXT:    .param .b64 param0;
@@ -233,15 +233,15 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot4;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    ld.param.u64 %rd1, [escape_ptr_store_param_0];
+; PTX-NEXT:    ld.param.b64 %rd1, [escape_ptr_store_param_0];
 ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
 ; PTX-NEXT:    add.u64 %rd3, %SP, 0;
 ; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [escape_ptr_store_param_1+4];
-; PTX-NEXT:    st.local.u32 [%rd4+4], %r1;
-; PTX-NEXT:    ld.param.u32 %r2, [escape_ptr_store_param_1];
-; PTX-NEXT:    st.local.u32 [%rd4], %r2;
-; PTX-NEXT:    st.global.u64 [%rd2], %rd3;
+; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_store_param_1+4];
+; PTX-NEXT:    st.local.b32 [%rd4+4], %r1;
+; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_store_param_1];
+; PTX-NEXT:    st.local.b32 [%rd4], %r2;
+; PTX-NEXT:    st.global.b64 [%rd2], %rd3;
 ; PTX-NEXT:    ret;
 entry:
   store ptr %s, ptr %out, align 8
@@ -271,16 +271,16 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot5;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    ld.param.u64 %rd1, [escape_ptr_gep_store_param_0];
+; PTX-NEXT:    ld.param.b64 %rd1, [escape_ptr_gep_store_param_0];
 ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
 ; PTX-NEXT:    add.u64 %rd3, %SP, 0;
 ; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [escape_ptr_gep_store_param_1+4];
-; PTX-NEXT:    st.local.u32 [%rd4+4], %r1;
-; PTX-NEXT:    ld.param.u32 %r2, [escape_ptr_gep_store_param_1];
-; PTX-NEXT:    st.local.u32 [%rd4], %r2;
+; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_gep_store_param_1+4];
+; PTX-NEXT:    st.local.b32 [%rd4+4], %r1;
+; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_gep_store_param_1];
+; PTX-NEXT:    st.local.b32 [%rd4], %r2;
 ; PTX-NEXT:    add.s64 %rd5, %rd3, 4;
-; PTX-NEXT:    st.global.u64 [%rd2], %rd5;
+; PTX-NEXT:    st.global.b64 [%rd2], %rd5;
 ; PTX-NEXT:    ret;
 entry:
   %b = getelementptr inbounds nuw i8, ptr %s, i64 4
@@ -311,15 +311,15 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot6;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    ld.param.u64 %rd1, [escape_ptrtoint_param_0];
+; PTX-NEXT:    ld.param.b64 %rd1, [escape_ptrtoint_param_0];
 ; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
 ; PTX-NEXT:    add.u64 %rd3, %SP, 0;
 ; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [escape_ptrtoint_param_1+4];
-; PTX-NEXT:    st.local.u32 [%rd4+4], %r1;
-; PTX-NEXT:    ld.param.u32 %r2, [escape_ptrtoint_param_1];
-; PTX-NEXT:    st.local.u32 [%rd4], %r2;
-; PTX-NEXT:    st.global.u64 [%rd2], %rd3;
+; PTX-NEXT:    ld.param.b32 %r1, [escape_ptrtoint_param_1+4];
+; PTX-NEXT:    st.local.b32 [%rd4+4], %r1;
+; PTX-NEXT:    ld.param.b32 %r2, [escape_ptrtoint_param_1];
+; PTX-NEXT:    st.local.b32 [%rd4], %r2;
+; PTX-NEXT:    st.global.b64 [%rd2], %rd3;
 ; PTX-NEXT:    ret;
 entry:
   %i = ptrtoint ptr %s to i64
@@ -348,39 +348,39 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
 ; PTX-NEXT:    .reg .b64 %rd<2>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
-; PTX-NEXT:    ld.param.u64 %rd1, [memcpy_from_param_param_0];
-; PTX-NEXT:    ld.param.u8 %rs1, [memcpy_from_param_param_1+15];
-; PTX-NEXT:    st.volatile.u8 [%rd1+15], %rs1;
-; PTX-NEXT:    ld.param.u8 %rs2, [memcpy_from_param_param_1+14];
-; PTX-NEXT:    st.volatile.u8 [%rd1+14], %rs2;
-; PTX-NEXT:    ld.param.u8 %rs3, [memcpy_from_param_param_1+13];
-; PTX-NEXT:    st.volatile.u8 [%rd1+13], %rs3;
-; PTX-NEXT:    ld.param.u8 %rs4, [memcpy_from_param_param_1+12];
-; PTX-NEXT:    st.volatile.u8 [%rd1+12], %rs4;
-; PTX-NEXT:    ld.param.u8 %rs5, [memcpy_from_param_param_1+11];
-; PTX-NEXT:    st.volatile.u8 [%rd1+11], %rs5;
-; PTX-NEXT:    ld.param.u8 %rs6, [memcpy_from_param_param_1+10];
-; PTX-NEXT:    st.volatile.u8 [%rd1+10], %rs6;
-; PTX-NEXT:    ld.param.u8 %rs7, [memcpy_from_param_param_1+9];
-; PTX-NEXT:    st.volatile.u8 [%rd1+9], %rs7;
-; PTX-NEXT:    ld.param.u8 %rs8, [memcpy_from_param_param_1+8];
-; PTX-NEXT:    st.volatile.u8 [%rd1+8], %rs8;
-; PTX-NEXT:    ld.param.u8 %rs9, [memcpy_from_param_param_1+7];
-; PTX-NEXT:    st.volatile.u8 [%rd1+7], %rs9;
-; PTX-NEXT:    ld.param.u8 %rs10, [memcpy_from_param_param_1+6];
-; PTX-NEXT:    st.volatile.u8 [%rd1+6], %rs10;
-; PTX-NEXT:    ld.param.u8 %rs11, [memcpy_from_param_param_1+5];
-; PTX-NEXT:    st.volatile.u8 [%rd1+5], %rs11;
-; PTX-NEXT:    ld.param.u8 %rs12, [memcpy_from_param_param_1+4];
-; PTX-NEXT:    st.volatile.u8 [%rd1+4], %rs12;
-; PTX-NEXT:    ld.param.u8 %rs13, [memcpy_from_param_param_1+3];
-; PTX-NEXT:    st.volatile.u8 [%rd1+3], %rs13;
-; PTX-NEXT:    ld.param.u8 %rs14, [memcpy_from_param_param_1+2];
-; PTX-NEXT:    st.volatile.u8 [%rd1+2], %rs14;
-; PTX-NEXT:    ld.param.u8 %rs15, [memcpy_from_param_param_1+1];
-; PTX-NEXT:    st.volatile.u8 [%rd1+1], %rs15;
-; PTX-NEXT:    ld.param.u8 %rs16, [memcpy_from_param_param_1];
-; PTX-NEXT:    st.volatile.u8 [%rd1], %rs16;
+; PTX-NEXT:    ld.param.b64 %rd1, [memcpy_from_param_param_0];
+; PTX-NEXT:    ld.param.b8 %rs1, [memcpy_from_param_param_1+15];
+; PTX-NEXT:    st.volatile.b8 [%rd1+15], %rs1;
+; PTX-NEXT:    ld.param.b8 %rs2, [memcpy_from_param_param_1+14];
+; PTX-NEXT:    st.volatile.b8 [%rd1+14], %rs2;
+; PTX-NEXT:    ld.param.b8 %rs3, [memcpy_from_param_param_1+13];
+; PTX-NEXT:    st.volatile.b8 [%rd1+13], %rs3;
+; PTX-NEXT:    ld.param.b8 %rs4, [memcpy_from_param_param_1+12];
+; PTX-NEXT:    st.volatile.b8 [%rd1+12], %rs4;
+; PTX-NEXT:    ld.param.b8 %rs5, [memcpy_from_param_param_1+11];
+; PTX-NEXT:    st.volatile.b8 [%rd1+11], %rs5;
+; PTX-NEXT:    ld.param.b8 %rs6, [memcpy_from_param_param_1+10];
+; PTX-NEXT:    st.volatile.b8 [%rd1+10], %rs6;
+; PTX-NEXT:    ld.param.b8 %rs7, [memcpy_from_param_param_1+9];
+; PTX-NEXT:    st.volatile.b8 [%rd1+9], %rs7;
+; PTX-NEXT:    ld.param.b8 %rs8, [memcpy_from_param_param_1+8];
+; PTX-NEXT:    st.volatile.b8 [%rd1+8], %rs8;
+; PTX-NEXT:    ld.param.b8 %rs9, [memcpy_from_param_param_1+7];
+; PTX-NEXT:    st.volatile.b8 [%rd1+7], %rs9;
+; PTX-NEXT:    ld.param.b8 %rs10, [memcpy_from_param_param_1+6];
+; PTX-NEXT:    st.volatile.b8 [%rd1+6], %rs10;
+; PTX-NEXT:    ld.param.b8 %rs11, [memcpy_from_param_param_1+5];
+; PTX-NEXT:    st.volatile.b8 [%rd1+5], %rs11;
+; PTX-NEXT:    ld.param.b8 %rs12, [memcpy_from_param_param_1+4];
+; PTX-NEXT:    st.volatile.b8 [%rd1+4], %rs12;
+; PTX-NEXT:    ld.param.b8 %rs13, [memcpy_from_param_param_1+3];
+; PTX-NEXT:    st.volatile.b8 [%rd1+3], %rs13;
+; PTX-NEXT:    ld.param.b8 %rs14, [memcpy_from_param_param_1+2];
+; PTX-NEXT:    st.volatile.b8 [%rd1+2], %rs14;
+; PTX-NEXT:    ld.param.b8 %rs15, [memcpy_from_param_param_1+1];
+; PTX-NEXT:    st.volatile.b8 [%rd1+1], %rs15;
+; PTX-NEXT:    ld.param.b8 %rs16, [memcpy_from_param_param_1];
+; PTX-NEXT:    st.volatile.b8 [%rd1], %rs16;
 ; PTX-NEXT:    ret;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
@@ -408,39 +408,39 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound
 ; PTX-NEXT:    .reg .b64 %rd<2>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
-; PTX-NEXT:    ld.param.u64 %rd1, [memcpy_from_param_noalign_param_0];
-; PTX-NEXT:    ld.param.u8 %rs1, [memcpy_from_param_noalign_param_1+15];
-; PTX-NEXT:    st.volatile.u8 [%rd1+15], %rs1;
-; PTX-NEXT:    ld.param.u8 %rs2, [memcpy_from_param_noalign_param_1+14];
-; PTX-NEXT:    st.volatile.u8 [%rd1+14], %rs2;
-; PTX-NEXT:    ld.param.u8 %rs3, [memcpy_from_param_noalign_param_1+13];
-; PTX-NEXT:    st.volatile.u8 [%rd1+13], %rs3;
-; PTX-NEXT:    ld.param.u8 %rs4, [memcpy_from_param_noalign_param_1+12];
-; PTX-NEXT:    st.volatile.u8 [%rd1+12], %rs4;
-; PTX-NEXT:    ld.param.u8 %rs5, [memcpy_from_param_noalign_param_1+11];
-; PTX-NEXT:    st.volatile.u8 [%rd1+11], %rs5;
-; PTX-NEXT:    ld.param.u8 %rs6, [memcpy_from_param_noalign_param_1+10];
-; PTX-NEXT:    st.volatile.u8 [%rd1+10], %rs6;
-; PTX-NEXT:    ld.param.u8 %rs7, [memcpy_from_param_noalign_param_1+9];
-; PTX-NEXT:    st.volatile.u8 [%rd1+9], %rs7;
-; PTX-NEXT:    ld.param.u8 %rs8, [memcpy_from_param_noalign_param_1+8];
-; PTX-NEXT:    st.volatile.u8 [%rd1+8], %rs8;
-; PTX-NEXT:    ld.param.u8 %rs9, [memcpy_from_param_noalign_param_1+7];
-; PTX-NEXT:    st.volatile.u8 [%rd1+7], %rs9;
-; PTX-NEXT:    ld.param.u8 %rs10, [memcpy_from_param_noalign_param_1+6];
-; PTX-NEXT:    st.volatile.u8 [%rd1+6], %rs10;
-; PTX-NEXT:    ld.param.u8 %rs11, [memcpy_from_param_noalign_param_1+5];
-; PTX-NEXT:    st.volatile.u8 [%rd1+5], %rs11;
-; PTX-NEXT:    ld.param.u8 %rs12, [memcpy_from_param_noalign_param_1+4];
-; PTX-NEXT:    st.volatile.u8 [%rd1+4], %rs12;
-; PTX-NEXT:    ld.param.u8 %rs13, [memcpy_from_param_noalign_param_1+3];
-; PTX-NEXT:    st.volatile.u8 [%rd1+3], %rs13;
-; PTX-NEXT:    ld.param.u8 %rs14, [memcpy_from_param_noalign_param_1+2];
-; PTX-NEXT:    st.volatile.u8 [%rd1+2], %rs14;
-; PTX-NEXT:    ld.param.u8 %rs15, [memcpy_from_param_noalign_param_1+1];
-; PTX-NEXT:    st.volatile.u8 [%rd1+1], %rs15;
-; PTX-NEXT:    ld.param.u8 %rs16, [memcpy_from_param_noalign_param_1];
-; PTX-NEXT:    st.volatile.u8 [%rd1], %rs16;
+; PTX-NEXT:    ld.param.b64 %rd1, [memcpy_from_param_noalign_param_0];
+; PTX-NEXT:    ld.param.b8 %rs1, [memcpy_from_param_noalign_param_1+15];
+; PTX-NEXT:    st.volatile.b8 [%rd1+15], %rs1;
+; PTX-NEXT:    ld.param.b8 %rs2, [memcpy_from_param_noalign_param_1+14];
+; PTX-NEXT:    st.volatile.b8 [%rd1+14], %rs2;
+; PTX-NEXT:    ld.param.b8 %rs3, [memcpy_from_param_noalign_param_1+13];
+; PTX-NEXT:    st.volatile.b8 [%rd1+13], %rs3;
+; PTX-NEXT:    ld.param.b8 %rs4, [memcpy_from_param_noalign_param_1+12];
+; PTX-NEXT:    st.volatile.b8 [%rd1+12], %rs4;
+; PTX-NEXT:    ld.param.b8 %rs5, [memcpy_from_param_noalign_param_1+11];
+; PTX-NEXT:    st.volatile.b8 [%rd1+11], %rs5;
+; PTX-NEXT:    ld.param.b8 %rs6, [memcpy_from_param_noalign_param_1+10];
+; PTX-NEXT:    st.volatile.b8 [%rd1+10], %rs6;
+; PTX-NEXT:    ld.param.b8 %rs7, [memcpy_from_param_noalign_param_1+9];
+; PTX-NEXT:    st.volatile.b8 [%rd1+9], %rs7;
+; PTX-NEXT:    ld.param.b8 %rs8, [memcpy_from_param_noalign_param_1+8];
+; PTX-NEXT:    st.volatile.b8 [%rd1+8], %rs8;
+; PTX-NEXT:    ld.param.b8 %rs9, [memcpy_from_param_noalign_param_1+7];
+; PTX-NEXT:    st.volatile.b8 [%rd1+7], %rs9;
+; PTX-NEXT:    ld.param.b8 %rs10, [memcpy_from_param_noalign_param_1+6];
+; PTX-NEXT:    st.volatile.b8 [%rd1+6], %rs10;
+; PTX-NEXT:    ld.param.b8 %rs11, [memcpy_from_param_noalign_param_1+5];
+; PTX-NEXT:    st.volatile.b8 [%rd1+5], %rs11;
+; PTX-NEXT:    ld.param.b8 %rs12, [memcpy_from_param_noalign_param_1+4];
+; PTX-NEXT:    st.volatile.b8 [%rd1+4], %rs12;
+; PTX-NEXT:    ld.param.b8 %rs13, [memcpy_from_param_noalign_param_1+3];
+; PTX-NEXT:    st.volatile.b8 [%rd1+3], %rs13;
+; PTX-NEXT:    ld.param.b8 %rs14, [memcpy_from_param_noalign_param_1+2];
+; PTX-NEXT:    st.volatile.b8 [%rd1+2], %rs14;
+; PTX-NEXT:    ld.param.b8 %rs15, [memcpy_from_param_noalign_param_1+1];
+; PTX-NEXT:    st.volatile.b8 [%rd1+1], %rs15;
+; PTX-NEXT:    ld.param.b8 %rs16, [memcpy_from_param_noalign_param_1];
+; PTX-NEXT:    st.volatile.b8 [%rd1], %rs16;
 ; PTX-NEXT:    ret;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
@@ -469,58 +469,58 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot9;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    ld.param.u64 %rd1, [memcpy_to_param_param_0];
+; PTX-NEXT:    ld.param.b64 %rd1, [memcpy_to_param_param_0];
 ; PTX-NEXT:    add.u64 %rd3, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [memcpy_to_param_param_1+4];
-; PTX-NEXT:    st.local.u32 [%rd3+4], %r1;
-; PTX-NEXT:    ld.param.u32 %r2, [memcpy_to_param_param_1];
-; PTX-NEXT:    st.local.u32 [%rd3], %r2;
-; PTX-NEXT:    ld.volatile.u8 %rd4, [%rd1];
-; PTX-NEXT:    ld.volatile.u8 %rd5, [%rd1+1];
+; PTX-NEXT:    ld.param.b32 %r1, [memcpy_to_param_param_1+4];
+; PTX-NEXT:    st.local.b32 [%rd3+4], %r1;
+; PTX-NEXT:    ld.param.b32 %r2, [memcpy_to_param_param_1];
+; PTX-NEXT:    st.local.b32 [%rd3], %r2;
+; PTX-NEXT:    ld.volatile.b8 %rd4, [%rd1];
+; PTX-NEXT:    ld.volatile.b8 %rd5, [%rd1+1];
 ; PTX-NEXT:    shl.b64 %rd6, %rd5, 8;
 ; PTX-NEXT:    or.b64 %rd7, %rd6, %rd4;
-; PTX-NEXT:    ld.volatile.u8 %rd8, [%rd1+2];
+; PTX-NEXT:    ld.volatile.b8 %rd8, [%rd1+2];
 ; PTX-NEXT:    shl.b64 %rd9, %rd8, 16;
-; PTX-NEXT:    ld.volatile.u8 %rd10, [%rd1+3];
+; PTX-NEXT:    ld.volatile.b8 %rd10, [%rd1+3];
 ; PTX-NEXT:    shl.b64 %rd11, %rd10, 24;
 ; PTX-NEXT:    or.b64 %rd12, %rd11, %rd9;
 ; PTX-NEXT:    or.b64 %rd13, %rd12, %rd7;
-; PTX-NEXT:    ld.volatile.u8 %rd14, [%rd1+4];
-; PTX-NEXT:    ld.volatile.u8 %rd15, [%rd1+5];
+; PTX-NEXT:    ld.volatile.b8 %rd14, [%rd1+4];
+; PTX-NEXT:    ld.volatile.b8 %rd15, [%rd1+5];
 ; PTX-NEXT:    shl.b64 %rd16, %rd15, 8;
 ; PTX-NEXT:    or.b64 %rd17, %rd16, %rd14;
-; PTX-NEXT:    ld.volatile.u8 %rd18, [%rd1+6];
+; PTX-NEXT:    ld.volatile.b8 %rd18, [%rd1+6];
 ; PTX-NEXT:    shl.b64 %rd19, %rd18, 16;
-; PTX-NEXT:    ld.volatile.u8 %rd20, [%rd1+7];
+; PTX-NEXT:    ld.volatile.b8 %rd20, [%rd1+7];
 ; PTX-NEXT:    shl.b64 %rd21, %rd20, 24;
 ; PTX-NEXT:    or.b64 %rd22, %rd21, %rd19;
 ; PTX-NEXT:    or.b64 %rd23, %rd22, %rd17;
 ; PTX-NEXT:    shl.b64 %rd24, %rd23, 32;
 ; PTX-NEXT:    or.b64 %rd25, %rd24, %rd13;
-; PTX-NEXT:    st.volatile.u64 [%SP], %rd25;
-; PTX-NEXT:    ld.volatile.u8 %rd26, [%rd1+8];
-; PTX-NEXT:    ld.volatile.u8 %rd27, [%rd1+9];
+; PTX-NEXT:    st.volatile.b64 [%SP], %rd25;
+; PTX-NEXT:    ld.volatile.b8 %rd26, [%rd1+8];
+; PTX-NEXT:    ld.volatile.b8 %rd27, [%rd1+9];
 ; PTX-NEXT:    shl.b64 %rd28, %rd27, 8;
 ; PTX-NEXT:    or.b64 %rd29, %rd28, %rd26;
-; PTX-NEXT:    ld.volatile.u8 %rd30, [%rd1+10];
+; PTX-NEXT:    ld.volatile.b8 %rd30, [%rd1+10];
 ; PTX-NEXT:    shl.b64 %rd31, %rd30, 16;
-; PTX-NEXT:    ld.volatile.u8 %rd32, [%rd1+11];
+; PTX-NEXT:    ld.volatile.b8 %rd32, [%rd1+11];
 ; PTX-NEXT:    shl.b64 %rd33, %rd32, 24;
 ; PTX-NEXT:    or.b64 %rd34, %rd33, %rd31;
 ; PTX-NEXT:    or.b64 %rd35, %rd34, %rd29;
-; PTX-NEXT:    ld.volatile.u8 %rd36, [%rd1+12];
-; PTX-NEXT:    ld.volatile.u8 %rd37, [%rd1+13];
+; PTX-NEXT:    ld.volatile.b8 %rd36, [%rd1+12];
+; PTX-NEXT:    ld.volatile.b8 %rd37, [%rd1+13];
 ; PTX-NEXT:    shl.b64 %rd38, %rd37, 8;
 ; PTX-NEXT:    or.b64 %rd39, %rd38, %rd36;
-; PTX-NEXT:    ld.volatile.u8 %rd40, [%rd1+14];
+; PTX-NEXT:    ld.volatile.b8 %rd40, [%rd1+14];
 ; PTX-NEXT:    shl.b64 %rd41, %rd40, 16;
-; PTX-NEXT:    ld.volatile.u8 %rd42, [%rd1+15];
+; PTX-NEXT:    ld.volatile.b8 %rd42, [%rd1+15];
 ; PTX-NEXT:    shl.b64 %rd43, %rd42, 24;
 ; PTX-NEXT:    or.b64 %rd44, %rd43, %rd41;
 ; PTX-NEXT:    or.b64 %rd45, %rd44, %rd39;
 ; PTX-NEXT:    shl.b64 %rd46, %rd45, 32;
 ; PTX-NEXT:    or.b64 %rd47, %rd46, %rd35;
-; PTX-NEXT:    st.volatile.u64 [%SP+8], %rd47;
+; PTX-NEXT:    st.volatile.b64 [%SP+8], %rd47;
 ; PTX-NEXT:    ret;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
@@ -600,15 +600,15 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; PTX_60-NEXT:    .reg .b64 %rd<3>;
 ; PTX_60-EMPTY:
 ; PTX_60-NEXT:  // %bb.0: // %bb
-; PTX_60-NEXT:    ld.param.u8 %rs1, [test_select_param_3];
+; PTX_60-NEXT:    ld.param.b8 %rs1, [test_select_param_3];
 ; PTX_60-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX_60-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; PTX_60-NEXT:    ld.param.u64 %rd1, [test_select_param_2];
+; PTX_60-NEXT:    ld.param.b64 %rd1, [test_select_param_2];
 ; PTX_60-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTX_60-NEXT:    ld.param.u32 %r1, [test_select_param_1];
-; PTX_60-NEXT:    ld.param.u32 %r2, [test_select_param_0];
+; PTX_60-NEXT:    ld.param.b32 %r1, [test_select_param_1];
+; PTX_60-NEXT:    ld.param.b32 %r2, [test_select_param_0];
 ; PTX_60-NEXT:    selp.b32 %r3, %r2, %r1, %p1;
-; PTX_60-NEXT:    st.global.u32 [%rd2], %r3;
+; PTX_60-NEXT:    st.global.b32 [%rd2], %r3;
 ; PTX_60-NEXT:    ret;
 ;
 ; PTX_70-LABEL: test_select(
@@ -619,16 +619,16 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; PTX_70-NEXT:    .reg .b64 %rd<6>;
 ; PTX_70-EMPTY:
 ; PTX_70-NEXT:  // %bb.0: // %bb
-; PTX_70-NEXT:    ld.param.u8 %rs1, [test_select_param_3];
+; PTX_70-NEXT:    ld.param.b8 %rs1, [test_select_param_3];
 ; PTX_70-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX_70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; PTX_70-NEXT:    mov.b64 %rd1, test_select_param_0;
-; PTX_70-NEXT:    ld.param.u64 %rd2, [test_select_param_2];
+; PTX_70-NEXT:    ld.param.b64 %rd2, [test_select_param_2];
 ; PTX_70-NEXT:    cvta.to.global.u64 %rd3, %rd2;
 ; PTX_70-NEXT:    mov.b64 %rd4, test_select_param_1;
 ; PTX_70-NEXT:    selp.b64 %rd5, %rd1, %rd4, %p1;
-; PTX_70-NEXT:    ld.param.u32 %r1, [%rd5];
-; PTX_70-NEXT:    st.global.u32 [%rd3], %r1;
+; PTX_70-NEXT:    ld.param.b32 %r1, [%rd5];
+; PTX_70-NEXT:    st.global.b32 [%rd3], %r1;
 ; PTX_70-NEXT:    ret;
 bb:
   %ptrnew = select i1 %cond, ptr %input1, ptr %input2
@@ -664,18 +664,18 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
 ; PTX-NEXT:  // %bb.0: // %bb
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot12;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    ld.param.u8 %rs1, [test_select_write_param_3];
+; PTX-NEXT:    ld.param.b8 %rs1, [test_select_write_param_3];
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [test_select_write_param_1];
-; PTX-NEXT:    st.u32 [%SP], %r1;
-; PTX-NEXT:    ld.param.u32 %r2, [test_select_write_param_0];
-; PTX-NEXT:    st.u32 [%SP+4], %r2;
+; PTX-NEXT:    ld.param.b32 %r1, [test_select_write_param_1];
+; PTX-NEXT:    st.b32 [%SP], %r1;
+; PTX-NEXT:    ld.param.b32 %r2, [test_select_write_param_0];
+; PTX-NEXT:    st.b32 [%SP+4], %r2;
 ; PTX-NEXT:    add.u64 %rd2, %SPL, 4;
 ; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
 ; PTX-NEXT:    selp.b64 %rd5, %rd2, %rd4, %p1;
 ; PTX-NEXT:    mov.b32 %r3, 1;
-; PTX-NEXT:    st.local.u32 [%rd5], %r3;
+; PTX-NEXT:    st.local.b32 [%rd5], %r3;
 ; PTX-NEXT:    ret;
 bb:
   %ptrnew = select i1 %cond, ptr %input1, ptr %input2
@@ -756,17 +756,17 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; PTX_60-NEXT:    .reg .b64 %rd<3>;
 ; PTX_60-EMPTY:
 ; PTX_60-NEXT:  // %bb.0: // %bb
-; PTX_60-NEXT:    ld.param.u8 %rs1, [test_phi_param_3];
+; PTX_60-NEXT:    ld.param.b8 %rs1, [test_phi_param_3];
 ; PTX_60-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX_60-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; PTX_60-NEXT:    ld.param.u64 %rd2, [test_phi_param_2];
+; PTX_60-NEXT:    ld.param.b64 %rd2, [test_phi_param_2];
 ; PTX_60-NEXT:    cvta.to.global.u64 %rd1, %rd2;
-; PTX_60-NEXT:    ld.param.u32 %r4, [test_phi_param_0];
+; PTX_60-NEXT:    ld.param.b32 %r4, [test_phi_param_0];
 ; PTX_60-NEXT:    @%p1 bra $L__BB13_2;
 ; PTX_60-NEXT:  // %bb.1: // %second
-; PTX_60-NEXT:    ld.param.u32 %r4, [test_phi_param_1+4];
+; PTX_60-NEXT:    ld.param.b32 %r4, [test_phi_param_1+4];
 ; PTX_60-NEXT:  $L__BB13_2: // %merge
-; PTX_60-NEXT:    st.global.u32 [%rd1], %r4;
+; PTX_60-NEXT:    st.global.b32 [%rd1], %r4;
 ; PTX_60-NEXT:    ret;
 ;
 ; PTX_70-LABEL: test_phi(
@@ -777,19 +777,19 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; PTX_70-NEXT:    .reg .b64 %rd<8>;
 ; PTX_70-EMPTY:
 ; PTX_70-NEXT:  // %bb.0: // %bb
-; PTX_70-NEXT:    ld.param.u8 %rs1, [test_phi_param_3];
+; PTX_70-NEXT:    ld.param.b8 %rs1, [test_phi_param_3];
 ; PTX_70-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX_70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; PTX_70-NEXT:    mov.b64 %rd7, test_phi_param_0;
-; PTX_70-NEXT:    ld.param.u64 %rd6, [test_phi_param_2];
+; PTX_70-NEXT:    ld.param.b64 %rd6, [test_phi_param_2];
 ; PTX_70-NEXT:    cvta.to.global.u64 %rd1, %rd6;
 ; PTX_70-NEXT:    @%p1 bra $L__BB13_2;
 ; PTX_70-NEXT:  // %bb.1: // %second
 ; PTX_70-NEXT:    mov.b64 %rd2, test_phi_param_1;
 ; PTX_70-NEXT:    add.s64 %rd7, %rd2, 4;
 ; PTX_70-NEXT:  $L__BB13_2: // %merge
-; PTX_70-NEXT:    ld.param.u32 %r1, [%rd7];
-; PTX_70-NEXT:    st.global.u32 [%rd1], %r1;
+; PTX_70-NEXT:    ld.param.b32 %r1, [%rd7];
+; PTX_70-NEXT:    st.global.b32 [%rd1], %r1;
 ; PTX_70-NEXT:    ret;
 bb:
   br i1 %cond, label %first, label %second
@@ -844,21 +844,21 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
 ; PTX-NEXT:  // %bb.0: // %bb
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot14;
 ; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
-; PTX-NEXT:    ld.param.u8 %rs1, [test_phi_write_param_2];
+; PTX-NEXT:    ld.param.b8 %rs1, [test_phi_write_param_2];
 ; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
 ; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
 ; PTX-NEXT:    add.u64 %rd1, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [test_phi_write_param_1+4];
-; PTX-NEXT:    st.u32 [%SP], %r1;
+; PTX-NEXT:    ld.param.b32 %r1, [test_phi_write_param_1+4];
+; PTX-NEXT:    st.b32 [%SP], %r1;
 ; PTX-NEXT:    add.u64 %rd6, %SPL, 4;
-; PTX-NEXT:    ld.param.u32 %r2, [test_phi_write_param_0];
-; PTX-NEXT:    st.u32 [%SP+4], %r2;
+; PTX-NEXT:    ld.param.b32 %r2, [test_phi_write_param_0];
+; PTX-NEXT:    st.b32 [%SP+4], %r2;
 ; PTX-NEXT:    @%p1 bra $L__BB14_2;
 ; PTX-NEXT:  // %bb.1: // %second
 ; PTX-NEXT:    mov.b64 %rd6, %rd1;
 ; PTX-NEXT:  $L__BB14_2: // %merge
 ; PTX-NEXT:    mov.b32 %r3, 1;
-; PTX-NEXT:    st.local.u32 [%rd6], %r3;
+; PTX-NEXT:    st.local.b32 [%rd6], %r3;
 ; PTX-NEXT:    ret;
 bb:
   br i1 %cond, label %first, label %second
@@ -897,8 +897,8 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %SPL, __local_depot15;
 ; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
-; PTX-NEXT:    ld.param.u32 %r1, [test_forward_byval_arg_param_0];
-; PTX-NEXT:    st.local.u32 [%rd2], %r1;
+; PTX-NEXT:    ld.param.b32 %r1, [test_forward_byval_arg_param_0];
+; PTX-NEXT:    st.local.b32 [%rd2], %r1;
 ; PTX-NEXT:    { // callseq 2, 0
 ; PTX-NEXT:    .param .align 4 .b8 param0[4];
 ; PTX-NEXT:    st.param.b32 [param0], %r1;
@@ -925,7 +925,7 @@ define void @device_func(ptr byval(i32) align 4 %input) {
 ; PTX-NEXT:    .reg .b64 %rd<2>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
-; PTX-NEXT:    ld.param.u32 %r1, [device_func_param_0];
+; PTX-NEXT:    ld.param.b32 %r1, [device_func_param_0];
 ; PTX-NEXT:    { // callseq 3, 0
 ; PTX-NEXT:    .param .align 4 .b8 param0[4];
 ; PTX-NEXT:    st.param.b32 [param0], %r1;
diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
index 2e64c25594811..5022684adf71c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -11,9 +11,9 @@ define ptx_kernel void @kernel(ptr %input, ptr %output) {
 ; CHECK: cvta.to.global.u64
 ; CHECK: cvta.to.global.u64
   %1 = load float, ptr %input, align 4
-; CHECK: ld.global.f32
+; CHECK: ld.global.b32
   store float %1, ptr %output, align 4
-; CHECK: st.global.f32
+; CHECK: st.global.b32
   ret void
 }
 
@@ -21,9 +21,9 @@ define ptx_kernel void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %outpu
 ; CHECK-LABEL: .visible .entry kernel2(
 ; CHECK-NOT: cvta.to.global.u64
   %1 = load float, ptr addrspace(1) %input, align 4
-; CHECK: ld.global.f32
+; CHECK: ld.global.b32
   store float %1, ptr addrspace(1) %output, align 4
-; CHECK: st.global.f32
+; CHECK: st.global.b32
   ret void
 }
 
@@ -31,16 +31,16 @@ define ptx_kernel void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %outpu
 
 define ptx_kernel void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) {
 ; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
-; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
+; CHECK: ld.param.b64 	%[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
 ; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
-; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
+; CHECK: ld.param.b64 	%[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
 ; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
   %b_ptr = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
   %b = load ptr, ptr %b_ptr, align 8
   %v = load i32, ptr %b, align 4
-; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
+; CHECK: ld.global.b32 %[[val:.*]], [%[[iptr_g]]]
   store i32 %v, ptr %output, align 4
-; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
+; CHECK: st.global.b32 [%[[optr_g]]], %[[val]]
   ret void
 }
 
@@ -49,14 +49,14 @@ define ptx_kernel void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %ou
 ; There's also no assumption that all pointers within are in global space.
 define void @ptr_in_byval_func(ptr byval(%struct.S) %input, ptr %output) {
 ; CHECK-LABEL: .visible .func ptr_in_byval_func(
-; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_func_param_1]
-; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
+; CHECK: ld.param.b64 	%[[optr:rd.*]], [ptr_in_byval_func_param_1]
+; CHECK: ld.param.b64 	%[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
   %b_ptr = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
   %b = load ptr, ptr %b_ptr, align 8
   %v = load i32, ptr %b, align 4
-; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
+; CHECK: ld.b32 %[[val:.*]], [%[[iptr]]]
   store i32 %v, ptr %output, align 4
-; CHECK: st.u32 [%[[optr]]], %[[val]]
+; CHECK: st.b32 [%[[optr]]], %[[val]]
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/machine-sink.ll b/llvm/test/CodeGen/NVPTX/machine-sink.ll
index bcd19df1f6a69..ce16a41a11ff8 100644
--- a/llvm/test/CodeGen/NVPTX/machine-sink.ll
+++ b/llvm/test/CodeGen/NVPTX/machine-sink.ll
@@ -17,8 +17,8 @@ define float @post_dominate(float %x, i1 %cond) {
 entry:
   %0 = load float, ptr addrspacecast (ptr addrspace(3) @scalar1 to ptr), align 4
   %1 = load float, ptr addrspacecast (ptr addrspace(3) @scalar2 to ptr), align 4
-; CHECK: ld.shared.f32
-; CHECK: ld.shared.f32
+; CHECK: ld.shared.b32
+; CHECK: ld.shared.b32
   %2 = fmul float %0, %0
   %3 = fmul float %1, %2
 ; CHECK-NOT: bra
diff --git a/llvm/test/CodeGen/NVPTX/match.ll b/llvm/test/CodeGen/NVPTX/match.ll
index 4e783e8009f0e..ae01b0d3cc7e0 100644
--- a/llvm/test/CodeGen/NVPTX/match.ll
+++ b/llvm/test/CodeGen/NVPTX/match.ll
@@ -6,8 +6,8 @@ declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64)
 
 ; CHECK-LABEL: .func{{.*}}match_any_sync_i32
 define i32 @match_any_sync_i32(i32 %mask, i32 %value) {
-  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match_any_sync_i32_param_0];
-  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match_any_sync_i32_param_1];
+  ; CHECK: ld.param.b32 	[[MASK:%r[0-9]+]], [match_any_sync_i32_param_0];
+  ; CHECK: ld.param.b32 	[[VALUE:%r[0-9]+]], [match_any_sync_i32_param_1];
 
   ; CHECK:  match.any.sync.b32  [[V0:%r[0-9]+]], [[VALUE]], [[MASK]];
   %v0 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 %value)
@@ -25,8 +25,8 @@ define i32 @match_any_sync_i32(i32 %mask, i32 %value) {
 
 ; CHECK-LABEL: .func{{.*}}match_any_sync_i64
 define i32 @match_any_sync_i64(i32 %mask, i64 %value) {
-  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match_any_sync_i64_param_0];
-  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match_any_sync_i64_param_1];
+  ; CHECK: ld.param.b32 	[[MASK:%r[0-9]+]], [match_any_sync_i64_param_0];
+  ; CHECK: ld.param.b64 	[[VALUE:%rd[0-9]+]], [match_any_sync_i64_param_1];
 
   ; CHECK:  match.any.sync.b64  [[V0:%r[0-9]+]], [[VALUE]], [[MASK]];
   %v0 = call i32 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 %value)
@@ -47,8 +47,8 @@ declare {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32, i64)
 
 ; CHECK-LABEL: .func{{.*}}match_all_sync_i32p(
 define {i32,i1} @match_all_sync_i32p(i32 %mask, i32 %value) {
-  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match_all_sync_i32p_param_0];
-  ; CHECK: ld.param.u32 	[[VALUE:%r[0-9]+]], [match_all_sync_i32p_param_1];
+  ; CHECK: ld.param.b32 	[[MASK:%r[0-9]+]], [match_all_sync_i32p_param_0];
+  ; CHECK: ld.param.b32 	[[VALUE:%r[0-9]+]], [match_all_sync_i32p_param_1];
 
   ; CHECK:  match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
   %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 %value)
@@ -83,8 +83,8 @@ define {i32,i1} @match_all_sync_i32p(i32 %mask, i32 %value) {
 
 ; CHECK-LABEL: .func{{.*}}match_all_sync_i64p(
 define {i32,i1} @match_all_sync_i64p(i32 %mask, i64 %value) {
-  ; CHECK: ld.param.u32 	[[MASK:%r[0-9]+]], [match_all_sync_i64p_param_0];
-  ; CHECK: ld.param.u64 	[[VALUE:%rd[0-9]+]], [match_all_sync_i64p_param_1];
+  ; CHECK: ld.param.b32 	[[MASK:%r[0-9]+]], [match_all_sync_i64p_param_0];
+  ; CHECK: ld.param.b64 	[[VALUE:%rd[0-9]+]], [match_all_sync_i64p_param_1];
 
   ; CHECK:  match.all.sync.b64 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]];
   %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 %value)
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index a6d01c16c0ab8..c7c1ea84f9a39 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -53,9 +53,9 @@ define float @ceil_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [ceil_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [ceil_float_param_0];
 ; CHECK-NEXT:    cvt.rpi.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.ceil.f32(float %a)
   ret float %b
@@ -67,9 +67,9 @@ define float @ceil_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [ceil_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [ceil_float_ftz_param_0];
 ; CHECK-NEXT:    cvt.rpi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.ceil.f32(float %a)
   ret float %b
@@ -81,9 +81,9 @@ define double @ceil_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [ceil_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [ceil_double_param_0];
 ; CHECK-NEXT:    cvt.rpi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.ceil.f64(double %a)
   ret double %b
@@ -97,9 +97,9 @@ define float @floor_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [floor_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [floor_float_param_0];
 ; CHECK-NEXT:    cvt.rmi.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.floor.f32(float %a)
   ret float %b
@@ -111,9 +111,9 @@ define float @floor_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [floor_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [floor_float_ftz_param_0];
 ; CHECK-NEXT:    cvt.rmi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.floor.f32(float %a)
   ret float %b
@@ -125,9 +125,9 @@ define double @floor_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [floor_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [floor_double_param_0];
 ; CHECK-NEXT:    cvt.rmi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.floor.f64(double %a)
   ret double %b
@@ -144,7 +144,7 @@ define float @round_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [round_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [round_float_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, %f1;
 ; CHECK-NEXT:    and.b32 %r2, %r1, -2147483648;
 ; CHECK-NEXT:    or.b32 %r3, %r2, 1056964608;
@@ -157,7 +157,7 @@ define float @round_float(float %a) {
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %f7, %f1;
 ; CHECK-NEXT:    setp.lt.f32 %p2, %f5, 0f3F000000;
 ; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f8;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f8;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.round.f32(float %a)
   ret float %b
@@ -172,7 +172,7 @@ define float @round_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [round_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [round_float_ftz_param_0];
 ; CHECK-NEXT:    mov.b32 %r1, %f1;
 ; CHECK-NEXT:    and.b32 %r2, %r1, -2147483648;
 ; CHECK-NEXT:    or.b32 %r3, %r2, 1056964608;
@@ -185,7 +185,7 @@ define float @round_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f7, %f1;
 ; CHECK-NEXT:    setp.lt.ftz.f32 %p2, %f5, 0f3F000000;
 ; CHECK-NEXT:    selp.f32 %f8, %f7, %f6, %p2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f8;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f8;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.round.f32(float %a)
   ret float %b
@@ -199,7 +199,7 @@ define double @round_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [round_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [round_double_param_0];
 ; CHECK-NEXT:    abs.f64 %fd2, %fd1;
 ; CHECK-NEXT:    setp.lt.f64 %p1, %fd2, 0d3FE0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FE0000000000000;
@@ -208,7 +208,7 @@ define double @round_double(double %a) {
 ; CHECK-NEXT:    copysign.f64 %fd6, %fd1, %fd5;
 ; CHECK-NEXT:    setp.gt.f64 %p2, %fd2, 0d4330000000000000;
 ; CHECK-NEXT:    selp.f64 %fd7, %fd1, %fd6, %p2;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd7;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd7;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.round.f64(double %a)
   ret double %b
@@ -222,9 +222,9 @@ define float @nearbyint_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [nearbyint_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [nearbyint_float_param_0];
 ; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.nearbyint.f32(float %a)
   ret float %b
@@ -236,9 +236,9 @@ define float @nearbyint_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [nearbyint_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [nearbyint_float_ftz_param_0];
 ; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.nearbyint.f32(float %a)
   ret float %b
@@ -250,9 +250,9 @@ define double @nearbyint_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [nearbyint_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [nearbyint_double_param_0];
 ; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.nearbyint.f64(double %a)
   ret double %b
@@ -266,9 +266,9 @@ define float @rint_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [rint_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [rint_float_param_0];
 ; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.rint.f32(float %a)
   ret float %b
@@ -280,9 +280,9 @@ define float @rint_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [rint_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [rint_float_ftz_param_0];
 ; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.rint.f32(float %a)
   ret float %b
@@ -294,9 +294,9 @@ define double @rint_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [rint_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [rint_double_param_0];
 ; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.rint.f64(double %a)
   ret double %b
@@ -310,9 +310,9 @@ define float @roundeven_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [roundeven_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [roundeven_float_param_0];
 ; CHECK-NEXT:    cvt.rni.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.roundeven.f32(float %a)
   ret float %b
@@ -324,9 +324,9 @@ define float @roundeven_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [roundeven_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [roundeven_float_ftz_param_0];
 ; CHECK-NEXT:    cvt.rni.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.roundeven.f32(float %a)
   ret float %b
@@ -338,9 +338,9 @@ define double @roundeven_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [roundeven_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [roundeven_double_param_0];
 ; CHECK-NEXT:    cvt.rni.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.roundeven.f64(double %a)
   ret double %b
@@ -354,9 +354,9 @@ define float @trunc_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [trunc_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [trunc_float_param_0];
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.trunc.f32(float %a)
   ret float %b
@@ -368,9 +368,9 @@ define float @trunc_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [trunc_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [trunc_float_ftz_param_0];
 ; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.trunc.f32(float %a)
   ret float %b
@@ -382,9 +382,9 @@ define double @trunc_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [trunc_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [trunc_double_param_0];
 ; CHECK-NEXT:    cvt.rzi.f64.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.trunc.f64(double %a)
   ret double %b
@@ -398,9 +398,9 @@ define float @abs_float(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [abs_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [abs_float_param_0];
 ; CHECK-NEXT:    abs.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.fabs.f32(float %a)
   ret float %b
@@ -412,9 +412,9 @@ define float @abs_float_ftz(float %a) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [abs_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [abs_float_ftz_param_0];
 ; CHECK-NEXT:    abs.ftz.f32 %f2, %f1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %b = call float @llvm.fabs.f32(float %a)
   ret float %b
@@ -426,9 +426,9 @@ define double @abs_double(double %a) {
 ; CHECK-NEXT:    .reg .b64 %fd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [abs_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [abs_double_param_0];
 ; CHECK-NEXT:    abs.f64 %fd2, %fd1;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd2;
 ; CHECK-NEXT:    ret;
   %b = call double @llvm.fabs.f64(double %a)
   ret double %b
@@ -487,10 +487,10 @@ define float @minnum_float(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [minnum_float_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [minnum_float_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [minnum_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [minnum_float_param_1];
 ; CHECK-NEXT:    min.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float %b)
   ret float %x
@@ -502,9 +502,9 @@ define float @minnum_imm1(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [minnum_imm1_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [minnum_imm1_param_0];
 ; CHECK-NEXT:    min.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float 0.0)
   ret float %x
@@ -516,9 +516,9 @@ define float @minnum_imm2(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [minnum_imm2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [minnum_imm2_param_0];
 ; CHECK-NEXT:    min.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float 0.0, float %a)
   ret float %x
@@ -530,10 +530,10 @@ define float @minnum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [minnum_float_ftz_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [minnum_float_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [minnum_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [minnum_float_ftz_param_1];
 ; CHECK-NEXT:    min.ftz.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.minnum.f32(float %a, float %b)
   ret float %x
@@ -545,10 +545,10 @@ define double @minnum_double(double %a, double %b) {
 ; CHECK-NEXT:    .reg .b64 %fd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [minnum_double_param_0];
-; CHECK-NEXT:    ld.param.f64 %fd2, [minnum_double_param_1];
+; CHECK-NEXT:    ld.param.b64 %fd1, [minnum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd2, [minnum_double_param_1];
 ; CHECK-NEXT:    min.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.minnum.f64(double %a, double %b)
   ret double %x
@@ -690,9 +690,9 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_param_1];
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f2;
 ; CHECK-NOF16-NEXT:    min.f32 %f3, %f1, %f2;
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
@@ -703,7 +703,7 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_float(
@@ -711,10 +711,10 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-F16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_float_param_0];
-; CHECK-F16-NEXT:    ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_float_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f2, [minimum_float_param_1];
 ; CHECK-F16-NEXT:    min.NaN.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_float(
@@ -722,10 +722,10 @@ define float @minimum_float(float %a, float %b) {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_param_1];
 ; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float %a, float %b)
   ret float %x
@@ -739,7 +739,7 @@ define float @minimum_imm1(float %a) {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<6>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm1_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
 ; CHECK-NOF16-NEXT:    min.f32 %f2, %f1, 0f00000000;
@@ -748,7 +748,7 @@ define float @minimum_imm1(float %a) {
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, %f1, %f3, %p2;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f5, %f4, %f3, %p3;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f5;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f5;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_imm1(
@@ -756,9 +756,9 @@ define float @minimum_imm1(float %a) {
 ; CHECK-F16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_imm1_param_0];
 ; CHECK-F16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_imm1(
@@ -766,9 +766,9 @@ define float @minimum_imm1(float %a) {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm1_param_0];
 ; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float %a, float 0.0)
   ret float %x
@@ -782,7 +782,7 @@ define float @minimum_imm2(float %a) {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<6>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm2_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
 ; CHECK-NOF16-NEXT:    min.f32 %f2, %f1, 0f00000000;
@@ -791,7 +791,7 @@ define float @minimum_imm2(float %a) {
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, %f1, %f3, %p2;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p3, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f5, %f4, %f3, %p3;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f5;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f5;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_imm2(
@@ -799,9 +799,9 @@ define float @minimum_imm2(float %a) {
 ; CHECK-F16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_imm2_param_0];
 ; CHECK-F16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_imm2(
@@ -809,9 +809,9 @@ define float @minimum_imm2(float %a) {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_imm2_param_0];
 ; CHECK-SM80-NOF16-NEXT:    min.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float 0.0, float %a)
   ret float %x
@@ -825,9 +825,9 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_ftz_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_ftz_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_ftz_param_1];
 ; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %f1, %f2;
 ; CHECK-NOF16-NEXT:    min.ftz.f32 %f3, %f1, %f2;
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
@@ -838,7 +838,7 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: minimum_float_ftz(
@@ -846,10 +846,10 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-F16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [minimum_float_ftz_param_0];
-; CHECK-F16-NEXT:    ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [minimum_float_ftz_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f2, [minimum_float_ftz_param_1];
 ; CHECK-F16-NEXT:    min.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: minimum_float_ftz(
@@ -857,10 +857,10 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [minimum_float_ftz_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [minimum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [minimum_float_ftz_param_1];
 ; CHECK-SM80-NOF16-NEXT:    min.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.minimum.f32(float %a, float %b)
   ret float %x
@@ -874,9 +874,9 @@ define double @minimum_double(double %a, double %b) {
 ; CHECK-NEXT:    .reg .b64 %fd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [minimum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [minimum_double_param_0];
 ; CHECK-NEXT:    mov.b64 %rd1, %fd1;
-; CHECK-NEXT:    ld.param.f64 %fd2, [minimum_double_param_1];
+; CHECK-NEXT:    ld.param.b64 %fd2, [minimum_double_param_1];
 ; CHECK-NEXT:    setp.nan.f64 %p1, %fd1, %fd2;
 ; CHECK-NEXT:    min.f64 %fd3, %fd1, %fd2;
 ; CHECK-NEXT:    selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
@@ -887,7 +887,7 @@ define double @minimum_double(double %a, double %b) {
 ; CHECK-NEXT:    selp.f64 %fd6, %fd2, %fd5, %p3;
 ; CHECK-NEXT:    setp.eq.f64 %p4, %fd4, 0d0000000000000000;
 ; CHECK-NEXT:    selp.f64 %fd7, %fd6, %fd4, %p4;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd7;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd7;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.minimum.f64(double %a, double %b)
   ret double %x
@@ -1045,9 +1045,9 @@ define float @maxnum_imm1(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_imm1_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_imm1_param_0];
 ; CHECK-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float 0.0)
   ret float %x
@@ -1059,9 +1059,9 @@ define float @maxnum_imm2(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_imm2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_imm2_param_0];
 ; CHECK-NEXT:    max.f32 %f2, %f1, 0f00000000;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float 0.0, float %a)
   ret float %x
@@ -1073,10 +1073,10 @@ define float @maxnum_float(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_float_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [maxnum_float_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [maxnum_float_param_1];
 ; CHECK-NEXT:    max.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float %b)
   ret float %x
@@ -1088,10 +1088,10 @@ define float @maxnum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [maxnum_float_ftz_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [maxnum_float_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [maxnum_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [maxnum_float_ftz_param_1];
 ; CHECK-NEXT:    max.ftz.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.maxnum.f32(float %a, float %b)
   ret float %x
@@ -1103,10 +1103,10 @@ define double @maxnum_double(double %a, double %b) {
 ; CHECK-NEXT:    .reg .b64 %fd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [maxnum_double_param_0];
-; CHECK-NEXT:    ld.param.f64 %fd2, [maxnum_double_param_1];
+; CHECK-NEXT:    ld.param.b64 %fd1, [maxnum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd2, [maxnum_double_param_1];
 ; CHECK-NEXT:    max.f64 %fd3, %fd1, %fd2;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.maxnum.f64(double %a, double %b)
   ret double %x
@@ -1247,13 +1247,13 @@ define float @maximum_imm1(float %a) {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm1_param_0];
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
 ; CHECK-NOF16-NEXT:    max.f32 %f2, %f1, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f00000000, %f3, %p2;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f4;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_imm1(
@@ -1261,9 +1261,9 @@ define float @maximum_imm1(float %a) {
 ; CHECK-F16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_imm1_param_0];
 ; CHECK-F16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_imm1(
@@ -1271,9 +1271,9 @@ define float @maximum_imm1(float %a) {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm1_param_0];
 ; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float %a, float 0.0)
   ret float %x
@@ -1286,13 +1286,13 @@ define float @maximum_imm2(float %a) {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<5>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm2_param_0];
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f1;
 ; CHECK-NOF16-NEXT:    max.f32 %f2, %f1, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f3, 0f7FC00000, %f2, %p1;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %f3, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f00000000, %f3, %p2;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f4;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_imm2(
@@ -1300,9 +1300,9 @@ define float @maximum_imm2(float %a) {
 ; CHECK-F16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_imm2_param_0];
 ; CHECK-F16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_imm2(
@@ -1310,9 +1310,9 @@ define float @maximum_imm2(float %a) {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<3>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_imm2_param_0];
 ; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f2, %f1, 0f00000000;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float 0.0, float %a)
   ret float %x
@@ -1326,9 +1326,9 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_param_1];
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %f1, %f2;
 ; CHECK-NOF16-NEXT:    max.f32 %f3, %f1, %f2;
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
@@ -1339,7 +1339,7 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_float(
@@ -1347,10 +1347,10 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-F16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_float_param_0];
-; CHECK-F16-NEXT:    ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_float_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f2, [maximum_float_param_1];
 ; CHECK-F16-NEXT:    max.NaN.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_float(
@@ -1358,10 +1358,10 @@ define float @maximum_float(float %a, float %b) {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_param_1];
 ; CHECK-SM80-NOF16-NEXT:    max.NaN.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float %a, float %b)
   ret float %x
@@ -1375,9 +1375,9 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %f<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_ftz_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_ftz_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 %r1, %f1;
-; CHECK-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_ftz_param_1];
 ; CHECK-NOF16-NEXT:    setp.nan.ftz.f32 %p1, %f1, %f2;
 ; CHECK-NOF16-NEXT:    max.ftz.f32 %f3, %f1, %f2;
 ; CHECK-NOF16-NEXT:    selp.f32 %f4, 0f7FC00000, %f3, %p1;
@@ -1388,7 +1388,7 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-NOF16-NEXT:    selp.f32 %f6, %f2, %f5, %p3;
 ; CHECK-NOF16-NEXT:    setp.eq.ftz.f32 %p4, %f4, 0f00000000;
 ; CHECK-NOF16-NEXT:    selp.f32 %f7, %f6, %f4, %p4;
-; CHECK-NOF16-NEXT:    st.param.f32 [func_retval0], %f7;
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %f7;
 ; CHECK-NOF16-NEXT:    ret;
 ;
 ; CHECK-F16-LABEL: maximum_float_ftz(
@@ -1396,10 +1396,10 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-F16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
-; CHECK-F16-NEXT:    ld.param.f32 %f1, [maximum_float_ftz_param_0];
-; CHECK-F16-NEXT:    ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %f1, [maximum_float_ftz_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %f2, [maximum_float_ftz_param_1];
 ; CHECK-F16-NEXT:    max.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-F16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-F16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-F16-NEXT:    ret;
 ;
 ; CHECK-SM80-NOF16-LABEL: maximum_float_ftz(
@@ -1407,10 +1407,10 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
 ; CHECK-SM80-NOF16-NEXT:    .reg .b32 %f<4>;
 ; CHECK-SM80-NOF16-EMPTY:
 ; CHECK-SM80-NOF16-NEXT:  // %bb.0:
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f1, [maximum_float_ftz_param_0];
-; CHECK-SM80-NOF16-NEXT:    ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f1, [maximum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT:    ld.param.b32 %f2, [maximum_float_ftz_param_1];
 ; CHECK-SM80-NOF16-NEXT:    max.NaN.ftz.f32 %f3, %f1, %f2;
-; CHECK-SM80-NOF16-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-SM80-NOF16-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-SM80-NOF16-NEXT:    ret;
   %x = call float @llvm.maximum.f32(float %a, float %b)
   ret float %x
@@ -1424,9 +1424,9 @@ define double @maximum_double(double %a, double %b) {
 ; CHECK-NEXT:    .reg .b64 %fd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [maximum_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [maximum_double_param_0];
 ; CHECK-NEXT:    mov.b64 %rd1, %fd1;
-; CHECK-NEXT:    ld.param.f64 %fd2, [maximum_double_param_1];
+; CHECK-NEXT:    ld.param.b64 %fd2, [maximum_double_param_1];
 ; CHECK-NEXT:    setp.nan.f64 %p1, %fd1, %fd2;
 ; CHECK-NEXT:    max.f64 %fd3, %fd1, %fd2;
 ; CHECK-NEXT:    selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
@@ -1437,7 +1437,7 @@ define double @maximum_double(double %a, double %b) {
 ; CHECK-NEXT:    selp.f64 %fd6, %fd2, %fd5, %p3;
 ; CHECK-NEXT:    setp.eq.f64 %p4, %fd4, 0d0000000000000000;
 ; CHECK-NEXT:    selp.f64 %fd7, %fd6, %fd4, %p4;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd7;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd7;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.maximum.f64(double %a, double %b)
   ret double %x
@@ -1550,11 +1550,11 @@ define float @fma_float(float %a, float %b, float %c) {
 ; CHECK-NEXT:    .reg .b32 %f<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fma_float_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [fma_float_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [fma_float_param_2];
+; CHECK-NEXT:    ld.param.b32 %f1, [fma_float_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [fma_float_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [fma_float_param_2];
 ; CHECK-NEXT:    fma.rn.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %x
@@ -1566,11 +1566,11 @@ define float @fma_float_ftz(float %a, float %b, float %c) #1 {
 ; CHECK-NEXT:    .reg .b32 %f<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [fma_float_ftz_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [fma_float_ftz_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [fma_float_ftz_param_2];
+; CHECK-NEXT:    ld.param.b32 %f1, [fma_float_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [fma_float_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [fma_float_ftz_param_2];
 ; CHECK-NEXT:    fma.rn.ftz.f32 %f4, %f1, %f2, %f3;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f4;
 ; CHECK-NEXT:    ret;
   %x = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %x
@@ -1582,11 +1582,11 @@ define double @fma_double(double %a, double %b, double %c) {
 ; CHECK-NEXT:    .reg .b64 %fd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [fma_double_param_0];
-; CHECK-NEXT:    ld.param.f64 %fd2, [fma_double_param_1];
-; CHECK-NEXT:    ld.param.f64 %fd3, [fma_double_param_2];
+; CHECK-NEXT:    ld.param.b64 %fd1, [fma_double_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd2, [fma_double_param_1];
+; CHECK-NEXT:    ld.param.b64 %fd3, [fma_double_param_2];
 ; CHECK-NEXT:    fma.rn.f64 %fd4, %fd1, %fd2, %fd3;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd4;
 ; CHECK-NEXT:    ret;
   %x = call double @llvm.fma.f64(double %a, double %b, double %c)
   ret double %x
diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index b3abcc1a21d2c..db8733da5b7e4 100644
--- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -8,8 +8,8 @@ target triple = "nvptx64-nvidia-cuda"
 define <4 x float> @t1(ptr %p1) {
 ; CHECK-NOT: ld.v4
 ; CHECK-NOT: ld.v2
-; CHECK-NOT: ld.f32
-; CHECK: ld.u8
+; CHECK-NOT: ld.b32
+; CHECK: ld.b8
   %r = load <4 x float>, ptr %p1, align 1
   ret <4 x float> %r
 }
@@ -18,7 +18,7 @@ define <4 x float> @t1(ptr %p1) {
 define <4 x float> @t2(ptr %p1) {
 ; CHECK-NOT: ld.v4
 ; CHECK-NOT: ld.v2
-; CHECK: ld.f32
+; CHECK: ld.b32
   %r = load <4 x float>, ptr %p1, align 4
   ret <4 x float> %r
 }
@@ -39,12 +39,12 @@ define <4 x float> @t4(ptr %p1) {
 }
 
 ; CHECK-LABEL: .visible .func test_v1halfp0a1(
-; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0];
-; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1];
-; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
-; CHECK-DAG: st.u8        [%[[TO]]], [[B0]]
-; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
-; CHECK-DAG: st.u8        [%[[TO]]+1], [[B1]]
+; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0];
+; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1];
+; CHECK-DAG: ld.b8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.b8        [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.b8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.b8        [%[[TO]]+1], [[B1]]
 ; CHECK: ret
 define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) {
   %1 = load <1 x half>, ptr %from , align 1
@@ -53,16 +53,16 @@ define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) {
 }
 
 ; CHECK-LABEL: .visible .func test_v2halfp0a1(
-; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0];
-; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1];
-; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
-; CHECK-DAG: st.u8        [%[[TO]]],
-; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
-; CHECK-DAG: st.u8        [%[[TO]]+1],
-; CHECK-DAG: ld.u8        [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2]
-; CHECK-DAG: st.u8        [%[[TO]]+2],
-; CHECK-DAG: ld.u8        [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3]
-; CHECK-DAG: st.u8        [%[[TO]]+3],
+; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0];
+; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1];
+; CHECK-DAG: ld.b8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.b8        [%[[TO]]],
+; CHECK-DAG: ld.b8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.b8        [%[[TO]]+1],
+; CHECK-DAG: ld.b8        [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2]
+; CHECK-DAG: st.b8        [%[[TO]]+2],
+; CHECK-DAG: ld.b8        [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3]
+; CHECK-DAG: st.b8        [%[[TO]]+3],
 ; CHECK: ret
 define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x half>, ptr %from , align 1
@@ -71,24 +71,24 @@ define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) {
 }
 
 ; CHECK-LABEL: .visible .func test_v4halfp0a1(
-; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0];
-; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1];
-; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
-; CHECK-DAG: st.u8        [%[[TO]]], [[B0]]
-; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
-; CHECK-DAG: st.u8        [%[[TO]]+1], [[B1]]
-; CHECK-DAG: ld.u8        [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2]
-; CHECK-DAG: st.u8        [%[[TO]]+2], [[B2]]
-; CHECK-DAG: ld.u8        [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3]
-; CHECK-DAG: st.u8        [%[[TO]]+3], [[B3]]
-; CHECK-DAG: ld.u8        [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4]
-; CHECK-DAG: st.u8        [%[[TO]]+4], [[B4]]
-; CHECK-DAG: ld.u8        [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5]
-; CHECK-DAG: st.u8        [%[[TO]]+5], [[B5]]
-; CHECK-DAG: ld.u8        [[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6]
-; CHECK-DAG: st.u8        [%[[TO]]+6], [[B6]]
-; CHECK-DAG: ld.u8        [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7]
-; CHECK-DAG: st.u8        [%[[TO]]+7], [[B7]]
+; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0];
+; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1];
+; CHECK-DAG: ld.b8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.b8        [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.b8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.b8        [%[[TO]]+1], [[B1]]
+; CHECK-DAG: ld.b8        [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2]
+; CHECK-DAG: st.b8        [%[[TO]]+2], [[B2]]
+; CHECK-DAG: ld.b8        [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3]
+; CHECK-DAG: st.b8        [%[[TO]]+3], [[B3]]
+; CHECK-DAG: ld.b8        [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4]
+; CHECK-DAG: st.b8        [%[[TO]]+4], [[B4]]
+; CHECK-DAG: ld.b8        [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5]
+; CHECK-DAG: st.b8        [%[[TO]]+5], [[B5]]
+; CHECK-DAG: ld.b8        [[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6]
+; CHECK-DAG: st.b8        [%[[TO]]+6], [[B6]]
+; CHECK-DAG: ld.b8        [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7]
+; CHECK-DAG: st.b8        [%[[TO]]+7], [[B7]]
 ; CHECK: ret
 define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) {
   %1 = load <4 x half>, ptr %from , align 1
@@ -101,8 +101,8 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) {
 define void @s1(ptr %p1, <4 x float> %v) {
 ; CHECK-NOT: st.v4
 ; CHECK-NOT: st.v2
-; CHECK-NOT: st.f32
-; CHECK: st.u8
+; CHECK-NOT: st.b32
+; CHECK: st.b8
   store <4 x float> %v, ptr %p1, align 1
   ret void
 }
@@ -111,7 +111,7 @@ define void @s1(ptr %p1, <4 x float> %v) {
 define void @s2(ptr %p1, <4 x float> %v) {
 ; CHECK-NOT: st.v4
 ; CHECK-NOT: st.v2
-; CHECK: st.f32
+; CHECK: st.b32
   store <4 x float> %v, ptr %p1, align 4
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index fb4c653b709f3..2e12c5041b06b 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -12,22 +12,22 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT:    .reg .b64 %fd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %bb
-; CHECK-NEXT:    ld.param.u32 %r4, [wombat_param_2];
-; CHECK-NEXT:    ld.param.u32 %r3, [wombat_param_1];
-; CHECK-NEXT:    ld.param.u32 %r2, [wombat_param_0];
+; CHECK-NEXT:    ld.param.b32 %r4, [wombat_param_2];
+; CHECK-NEXT:    ld.param.b32 %r3, [wombat_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [wombat_param_0];
 ; CHECK-NEXT:    mov.b32 %r10, 0;
 ; CHECK-NEXT:  $L__BB0_1: // %bb3
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .b64 param0;
-; CHECK-NEXT:    st.param.f64 [param0], 0d0000000000000000;
+; CHECK-NEXT:    st.param.b64 [param0], 0d0000000000000000;
 ; CHECK-NEXT:    .param .b64 retval0;
 ; CHECK-NEXT:    call.uni (retval0),
 ; CHECK-NEXT:    quux,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.f64 %fd1, [retval0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [retval0];
 ; CHECK-NEXT:    } // callseq 0
 ; CHECK-NEXT:    mul.lo.s32 %r7, %r10, %r3;
 ; CHECK-NEXT:    or.b32 %r8, %r4, %r7;
@@ -36,7 +36,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT:    cvt.rn.f64.u32 %fd4, %r10;
 ; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, %fd3;
 ; CHECK-NEXT:    mov.b64 %rd1, 0;
-; CHECK-NEXT:    st.global.f64 [%rd1], %fd5;
+; CHECK-NEXT:    st.global.b64 [%rd1], %fd5;
 ; CHECK-NEXT:    mov.b32 %r10, 1;
 ; CHECK-NEXT:    bra.uni $L__BB0_1;
 bb:
diff --git a/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
index 21fce55fcbc24..8a88e1b26c7ff 100644
--- a/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll
@@ -9,8 +9,8 @@ define i16 @test_mulhi_i16(i16 %x, i16 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_mulhi_i16_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [test_mulhi_i16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_mulhi_i16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [test_mulhi_i16_param_1];
 ; CHECK-NEXT:    mul.hi.s16 %rs3, %rs1, %rs2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -26,8 +26,8 @@ define i16 @test_mulhi_u16(i16 %x, i16 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_mulhi_u16_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [test_mulhi_u16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_mulhi_u16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [test_mulhi_u16_param_1];
 ; CHECK-NEXT:    mul.hi.u16 %rs3, %rs1, %rs2;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -42,8 +42,8 @@ define i32 @test_mulhi_i32(i32 %x, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_mulhi_i32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_mulhi_i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_mulhi_i32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_mulhi_i32_param_1];
 ; CHECK-NEXT:    mul.hi.s32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -57,8 +57,8 @@ define i32 @test_mulhi_u32(i32 %x, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_mulhi_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_mulhi_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_mulhi_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_mulhi_u32_param_1];
 ; CHECK-NEXT:    mul.hi.u32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -72,8 +72,8 @@ define i64 @test_mulhi_i64(i64 %x, i64 %y) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_mulhi_i64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_mulhi_i64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_mulhi_i64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_mulhi_i64_param_1];
 ; CHECK-NEXT:    mul.hi.s64 %rd3, %rd1, %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
@@ -87,8 +87,8 @@ define i64 @test_mulhi_u64(i64 %x, i64 %y) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_mulhi_u64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_mulhi_u64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_mulhi_u64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_mulhi_u64_param_1];
 ; CHECK-NEXT:    mul.hi.u64 %rd3, %rd1, %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/nounroll.ll b/llvm/test/CodeGen/NVPTX/nounroll.ll
index f40c27ecd66be..e078570d4d436 100644
--- a/llvm/test/CodeGen/NVPTX/nounroll.ll
+++ b/llvm/test/CodeGen/NVPTX/nounroll.ll
@@ -20,15 +20,15 @@ for.body:
   %idxprom = sext i32 %i.06 to i64
   %arrayidx = getelementptr inbounds float, ptr %input, i64 %idxprom
   %0 = load float, ptr %arrayidx, align 4
-; CHECK: ld.f32
+; CHECK: ld.b32
   %arrayidx2 = getelementptr inbounds float, ptr %output, i64 %idxprom
   store float %0, ptr %arrayidx2, align 4
-; CHECK: st.f32
+; CHECK: st.b32
   %inc = add nuw nsw i32 %i.06, 1
   %exitcond = icmp eq i32 %inc, 2
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
-; CHECK-NOT: ld.f32
-; CHECK-NOT: st.f32
+; CHECK-NOT: ld.b32
+; CHECK-NOT: st.b32
 
 for.end:
   ret void
@@ -50,15 +50,15 @@ for.body:
   %idxprom = sext i32 %i.06 to i64
   %arrayidx = getelementptr inbounds float, ptr %input, i64 %idxprom
   %0 = load float, ptr %arrayidx, align 4
-; CHECK: ld.f32
+; CHECK: ld.b32
   %arrayidx2 = getelementptr inbounds float, ptr %output, i64 %idxprom
   store float %0, ptr %arrayidx2, align 4
-; CHECK: st.f32
+; CHECK: st.b32
   %inc = add nuw nsw i32 %i.06, 1
   %exitcond = icmp eq i32 %inc, 2
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
-; CHECK-NOT: ld.f32
-; CHECK-NOT: st.f32
+; CHECK-NOT: ld.b32
+; CHECK-NOT: st.b32
 
 for.end:
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
index 885c711d31f01..ff04e18701a84 100644
--- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
+++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll
@@ -144,15 +144,15 @@ return:
 
 ;      SM_52: .visible .func  (.param .b32 func_retval0) phi()
 ;      SM_52: mov.b32         %[[REG:.+]], 0f00000000;
-; SM_52-NEXT: st.param.f32    [func_retval0], %[[REG]];
+; SM_52-NEXT: st.param.b32    [func_retval0], %[[REG]];
 ; SM_52-NEXT: ret;
 ;      SM_70: .visible .func  (.param .b32 func_retval0) phi()
 ;      SM_70: mov.b32         %[[REG:.+]], 0f00000000;
-; SM_70-NEXT: st.param.f32    [func_retval0], %[[REG]];
+; SM_70-NEXT: st.param.b32    [func_retval0], %[[REG]];
 ; SM_70-NEXT: ret;
 ;      SM_90: .visible .func  (.param .b32 func_retval0) phi()
 ;      SM_90: mov.b32         %[[REG:.+]], 0f00000000;
-; SM_90-NEXT: st.param.f32    [func_retval0], %[[REG]];
+; SM_90-NEXT: st.param.b32    [func_retval0], %[[REG]];
 ; SM_90-NEXT: ret;
 define float @phi() {
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll
index c8daf3b5760f5..1840de4494157 100644
--- a/llvm/test/CodeGen/NVPTX/param-add.ll
+++ b/llvm/test/CodeGen/NVPTX/param-add.ll
@@ -18,13 +18,13 @@ define i32 @test(%struct.1float alignstack(32) %data) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %r1, [test_param_0+1];
+; CHECK-NEXT:    ld.param.b8 %r1, [test_param_0+1];
 ; CHECK-NEXT:    shl.b32 %r2, %r1, 8;
-; CHECK-NEXT:    ld.param.u8 %r3, [test_param_0];
+; CHECK-NEXT:    ld.param.b8 %r3, [test_param_0];
 ; CHECK-NEXT:    or.b32 %r4, %r2, %r3;
-; CHECK-NEXT:    ld.param.u8 %r5, [test_param_0+3];
+; CHECK-NEXT:    ld.param.b8 %r5, [test_param_0+3];
 ; CHECK-NEXT:    shl.b32 %r6, %r5, 8;
-; CHECK-NEXT:    ld.param.u8 %r7, [test_param_0+2];
+; CHECK-NEXT:    ld.param.b8 %r7, [test_param_0+2];
 ; CHECK-NEXT:    or.b32 %r8, %r6, %r7;
 ; CHECK-NEXT:    shl.b32 %r9, %r8, 16;
 ; CHECK-NEXT:    or.b32 %r17, %r9, %r4;
diff --git a/llvm/test/CodeGen/NVPTX/param-align.ll b/llvm/test/CodeGen/NVPTX/param-align.ll
index 2adc5496d833f..16220fb4d47bb 100644
--- a/llvm/test/CodeGen/NVPTX/param-align.ll
+++ b/llvm/test/CodeGen/NVPTX/param-align.ll
@@ -71,14 +71,14 @@ define ptx_device void @t6() {
 }
 
 ; CHECK-LABEL: .func check_ptr_align1(
-; CHECK: 	ld.param.u64 	%rd1, [check_ptr_align1_param_0];
-; CHECK-NOT: 	ld.param.u8
+; CHECK: 	ld.param.b64 	%rd1, [check_ptr_align1_param_0];
+; CHECK-NOT: 	ld.param.b8
 ; CHECK: 	mov.b32 	%r1, 0;
-; CHECK: 	st.u8 	[%rd1+3], %r1;
-; CHECK: 	st.u8 	[%rd1+2], %r1;
-; CHECK: 	st.u8 	[%rd1+1], %r1;
+; CHECK: 	st.b8 	[%rd1+3], %r1;
+; CHECK: 	st.b8 	[%rd1+2], %r1;
+; CHECK: 	st.b8 	[%rd1+1], %r1;
 ; CHECK: 	mov.b32 	%r2, 1;
-; CHECK: 	st.u8 	[%rd1], %r2;
+; CHECK: 	st.b8 	[%rd1], %r2;
 ; CHECK: 	ret;
 define void @check_ptr_align1(ptr align 1 %_arg_ptr) {
 entry:
@@ -87,12 +87,12 @@ entry:
 }
 
 ; CHECK-LABEL: .func check_ptr_align2(
-; CHECK: 	ld.param.u64 	%rd1, [check_ptr_align2_param_0];
-; CHECK-NOT: 	ld.param.u16
+; CHECK: 	ld.param.b64 	%rd1, [check_ptr_align2_param_0];
+; CHECK-NOT: 	ld.param.b16
 ; CHECK: 	mov.b32 	%r1, 0;
-; CHECK: 	st.u16 	[%rd1+2], %r1;
+; CHECK: 	st.b16 	[%rd1+2], %r1;
 ; CHECK: 	mov.b32 	%r2, 2;
-; CHECK: 	st.u16 	[%rd1], %r2;
+; CHECK: 	st.b16 	[%rd1], %r2;
 ; CHECK: 	ret;
 define void @check_ptr_align2(ptr align 2 %_arg_ptr) {
 entry:
@@ -101,10 +101,10 @@ entry:
 }
 
 ; CHECK-LABEL: .func check_ptr_align4(
-; CHECK: 	ld.param.u64 	%rd1, [check_ptr_align4_param_0];
-; CHECK-NOT: 	ld.param.u32
+; CHECK: 	ld.param.b64 	%rd1, [check_ptr_align4_param_0];
+; CHECK-NOT: 	ld.param.b32
 ; CHECK: 	mov.b32 	%r1, 4;
-; CHECK: 	st.u32 	[%rd1], %r1;
+; CHECK: 	st.b32 	[%rd1], %r1;
 ; CHECK: 	ret;
 define void @check_ptr_align4(ptr align 4 %_arg_ptr) {
 entry:
@@ -113,9 +113,9 @@ entry:
 }
 
 ; CHECK-LABEL: .func check_ptr_align8(
-; CHECK: 	ld.param.u64 	%rd1, [check_ptr_align8_param_0];
+; CHECK: 	ld.param.b64 	%rd1, [check_ptr_align8_param_0];
 ; CHECK: 	mov.b32 	%r1, 8;
-; CHECK: 	st.u32 	[%rd1], %r1;
+; CHECK: 	st.b32 	[%rd1], %r1;
 ; CHECK: 	ret;
 define void @check_ptr_align8(ptr align 8 %_arg_ptr) {
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 2523fab17d55d..781156082e540 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -24,7 +24,7 @@
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i1(
 ; CHECK-NEXT: .param .b32 test_i1_param_0
-; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i1_param_0];
+; CHECK:      ld.param.b8 [[A8:%rs[0-9]+]], [test_i1_param_0];
 ; CHECK:      and.b16 [[A:%rs[0-9]+]], [[A8]], 1;
 ; CHECK:      setp.ne.b16 %p1, [[A]], 0
 ; CHECK:      cvt.u32.u16 [[B:%r[0-9]+]], [[A8]]
@@ -48,7 +48,7 @@ define i1 @test_i1(i1 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i1s(
 ; CHECK-NEXT: .param .b32 test_i1s_param_0
-; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
+; CHECK:      ld.param.b8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
 ; CHECK:      cvt.u32.u16     [[A32:%r[0-9]+]], [[A8]];
 ; CHECK:      and.b32         [[A1:%r[0-9]+]], [[A32]], 1;
 ; CHECK:      neg.s32         [[A:%r[0-9]+]], [[A1]];
@@ -70,8 +70,8 @@ define signext i1 @test_i1s(i1 signext %a) {
 ; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
 ; CHECK-LABEL: test_v3i1(
 ; CHECK-NEXT: .param .align 1 .b8 test_v3i1_param_0[1]
-; CHECK-DAG:  ld.param.u8     [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
-; CHECK-DAG:  ld.param.u8     [[E0:%rs[0-9]+]], [test_v3i1_param_0]
+; CHECK-DAG:  ld.param.b8     [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
+; CHECK-DAG:  ld.param.b8     [[E0:%rs[0-9]+]], [test_v3i1_param_0]
 ; CHECK:      .param .align 1 .b8 param0[1];
 ; CHECK-DAG:  st.param.b8     [param0], [[E0]];
 ; CHECK-DAG:  st.param.b8     [param0+2], [[E2]];
@@ -91,7 +91,7 @@ define <3 x i1> @test_v3i1(<3 x i1> %a) {
 ; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
 ; CHECK-LABEL: test_v4i1(
 ; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1]
-; CHECK:      ld.param.u8 [[E0:%rs[0-9]+]], [test_v4i1_param_0]
+; CHECK:      ld.param.b8 [[E0:%rs[0-9]+]], [test_v4i1_param_0]
 ; CHECK:      .param .align 1 .b8 param0[1];
 ; CHECK:      st.param.b8  [param0], [[E0]];
 ; CHECK:      .param .align 1 .b8 retval0[1];
@@ -114,8 +114,8 @@ define <4 x i1> @test_v4i1(<4 x i1> %a) {
 ; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
 ; CHECK-LABEL: test_v5i1(
 ; CHECK-NEXT: .param .align 1 .b8 test_v5i1_param_0[1]
-; CHECK-DAG:  ld.param.u8     [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
-; CHECK-DAG:  ld.param.u8     [[E0:%rs[0-9]+]], [test_v5i1_param_0]
+; CHECK-DAG:  ld.param.b8     [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
+; CHECK-DAG:  ld.param.b8     [[E0:%rs[0-9]+]], [test_v5i1_param_0]
 ; CHECK:      .param .align 1 .b8 param0[1];
 ; CHECK-DAG:  st.param.b8     [param0], [[E0]];
 ; CHECK-DAG:  st.param.b8     [param0+4], [[E4]];
@@ -135,7 +135,7 @@ define <5 x i1> @test_v5i1(<5 x i1> %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i2(
 ; CHECK-NEXT: .param .b32 test_i2_param_0
-; CHECK:      ld.param.u8 {{%rs[0-9]+}}, [test_i2_param_0];
+; CHECK:      ld.param.b8 {{%rs[0-9]+}}, [test_i2_param_0];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], {{%r[0-9]+}};
 ; CHECK:      .param .b32 retval0;
@@ -152,7 +152,7 @@ define i2 @test_i2(i2 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i3(
 ; CHECK-NEXT: .param .b32 test_i3_param_0
-; CHECK:      ld.param.u8 {{%rs[0-9]+}}, [test_i3_param_0];
+; CHECK:      ld.param.b8 {{%rs[0-9]+}}, [test_i3_param_0];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], {{%r[0-9]+}};
 ; CHECK:      .param .b32 retval0;
@@ -170,7 +170,7 @@ define i3 @test_i3(i3 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i8(
 ; CHECK-NEXT: .param .b32 test_i8_param_0
-; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
+; CHECK:      ld.param.b8 [[A8:%rs[0-9]+]], [test_i8_param_0];
 ; CHECK:      cvt.u32.u16     [[A32:%r[0-9]+]], [[A8]];
 ; CHECK:      and.b32         [[A:%r[0-9]+]], [[A32]], 255;
 ; CHECK:      .param .b32 param0;
@@ -212,7 +212,7 @@ define signext i8 @test_i8s(i8 signext %a) {
 ; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
 ; CHECK-LABEL: test_v3i8(
 ; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
-; CHECK:      ld.param.u32     [[R:%r[0-9]+]], [test_v3i8_param_0];
+; CHECK:      ld.param.b32     [[R:%r[0-9]+]], [test_v3i8_param_0];
 ; CHECK:      .param .align 4 .b8 param0[4];
 ; CHECK:      st.param.b32  [param0], [[R]]
 ; CHECK:      .param .align 4 .b8 retval0[4];
@@ -231,7 +231,7 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) {
 ; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
 ; CHECK-LABEL: test_v4i8(
 ; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
-; CHECK:      ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0]
+; CHECK:      ld.param.b32 [[R:%r[0-9]+]], [test_v4i8_param_0]
 ; CHECK:      .param .align 4 .b8 param0[4];
 ; CHECK:      st.param.b32  [param0], [[R]];
 ; CHECK:      .param .align 4 .b8 retval0[4];
@@ -248,8 +248,8 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) {
 ; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
 ; CHECK-LABEL: test_v5i8(
 ; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
-; CHECK-DAG:  ld.param.u32    [[E0:%r[0-9]+]], [test_v5i8_param_0]
-; CHECK-DAG:  ld.param.u8     [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG:  ld.param.b32    [[E0:%r[0-9]+]], [test_v5i8_param_0]
+; CHECK-DAG:  ld.param.b8     [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
 ; CHECK:      .param .align 8 .b8 param0[8];
 ; CHECK-DAG:  st.param.v4.b8  [param0], 
 ; CHECK-DAG:  st.param.b8     [param0+4], [[E4]];
@@ -269,7 +269,7 @@ define <5 x i8> @test_v5i8(<5 x i8> %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i11(
 ; CHECK-NEXT: .param .b32 test_i11_param_0
-; CHECK:      ld.param.u16    {{%rs[0-9]+}}, [test_i11_param_0];
+; CHECK:      ld.param.b16    {{%rs[0-9]+}}, [test_i11_param_0];
 ; CHECK:      st.param.b32    [param0], {{%r[0-9]+}};
 ; CHECK:      .param .b32 retval0;
 ; CHECK:      call.uni (retval0),
@@ -285,7 +285,7 @@ define i11 @test_i11(i11 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i16(
 ; CHECK-NEXT: .param .b32 test_i16_param_0
-; CHECK:      ld.param.u16    [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK:      ld.param.b16    [[E16:%rs[0-9]+]], [test_i16_param_0];
 ; CHECK:      cvt.u32.u16     [[E32:%r[0-9]+]], [[E16]];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], [[E32]];
@@ -304,7 +304,7 @@ define i16 @test_i16(i16 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i16s(
 ; CHECK-NEXT: .param .b32 test_i16s_param_0
-; CHECK:      ld.param.u16    [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK:      ld.param.b16    [[E16:%rs[0-9]+]], [test_i16s_param_0];
 ; CHECK:      cvt.s32.s16     [[E32:%r[0-9]+]], [[E16]];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], [[E32]];
@@ -323,8 +323,8 @@ define signext i16 @test_i16s(i16 signext %a) {
 ; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
 ; CHECK-LABEL: test_v3i16(
 ; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
-; CHECK-DAG:  ld.param.u16    [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
-; CHECK-DAG:  ld.param.u32    [[R:%r[0-9]+]], [test_v3i16_param_0];
+; CHECK-DAG:  ld.param.b16    [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG:  ld.param.b32    [[R:%r[0-9]+]], [test_v3i16_param_0];
 ; CHECK-DAG:  mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[R]];
 ; CHECK:      .param .align 8 .b8 param0[8];
 ; CHECK:      st.param.v2.b16 [param0], {[[E0]], [[E1]]};
@@ -345,7 +345,7 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) {
 ; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
 ; CHECK-LABEL: test_v4i16(
 ; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
-; CHECK:      ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0]
+; CHECK:      ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0]
 ; CHECK:      .param .align 8 .b8 param0[8];
 ; CHECK:      st.param.v2.b32 [param0], {[[E0]], [[E1]]};
 ; CHECK:      .param .align 8 .b8 retval0[8];
@@ -362,8 +362,8 @@ define <4 x i16> @test_v4i16(<4 x i16> %a) {
 ; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
 ; CHECK-LABEL: test_v5i16(
 ; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
-; CHECK-DAG:  ld.param.u16    [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
-; CHECK-DAG:  ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK-DAG:  ld.param.b16    [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG:  ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
 ; CHECK:      .param .align 16 .b8 param0[16];
 ; CHECK-DAG:  st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
 ; CHECK-DAG:  st.param.b16    [param0+8], [[E4]];
@@ -474,7 +474,7 @@ define <3 x half> @test_v3f16(<3 x half> %a) {
 ; CHECK:.func  (.param .align 8 .b8 func_retval0[8])
 ; CHECK-LABEL: test_v4f16(
 ; CHECK:      .param .align 8 .b8 test_v4f16_param_0[8]
-; CHECK:      ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
+; CHECK:      ld.param.v2.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
 ; CHECK:      .param .align 8 .b8 param0[8];
 ; CHECK:      st.param.v2.b32 [param0], {[[R01]], [[R23]]};
 ; CHECK:      .param .align 8 .b8 retval0[8];
@@ -512,7 +512,7 @@ define <5 x half> @test_v5f16(<5 x half> %a) {
 ; CHECK:.func  (.param .align 16 .b8 func_retval0[16])
 ; CHECK-LABEL: test_v8f16(
 ; CHECK:      .param .align 16 .b8 test_v8f16_param_0[16]
-; CHECK:      ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
+; CHECK:      ld.param.v4.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
 ; CHECK:      .param .align 16 .b8 param0[16];
 ; CHECK:      st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]};
 ; CHECK:      .param .align 16 .b8 retval0[16];
@@ -554,8 +554,8 @@ define <9 x half> @test_v9f16(<9 x half> %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i19(
 ; CHECK-NEXT: .param .b32 test_i19_param_0
-; CHECK-DAG:  ld.param.u16    {{%r[0-9]+}}, [test_i19_param_0];
-; CHECK-DAG:  ld.param.u8     {{%r[0-9]+}}, [test_i19_param_0+2];
+; CHECK-DAG:  ld.param.b16    {{%r[0-9]+}}, [test_i19_param_0];
+; CHECK-DAG:  ld.param.b8     {{%r[0-9]+}}, [test_i19_param_0+2];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], {{%r[0-9]+}};
 ; CHECK:      .param .b32 retval0;
@@ -572,8 +572,8 @@ define i19 @test_i19(i19 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i23(
 ; CHECK-NEXT: .param .b32 test_i23_param_0
-; CHECK-DAG:  ld.param.u16    {{%r[0-9]+}}, [test_i23_param_0];
-; CHECK-DAG:  ld.param.u8     {{%r[0-9]+}}, [test_i23_param_0+2];
+; CHECK-DAG:  ld.param.b16    {{%r[0-9]+}}, [test_i23_param_0];
+; CHECK-DAG:  ld.param.b8     {{%r[0-9]+}}, [test_i23_param_0+2];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], {{%r[0-9]+}};
 ; CHECK:      .param .b32 retval0;
@@ -590,8 +590,8 @@ define i23 @test_i23(i23 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i24(
 ; CHECK-NEXT: .param .b32 test_i24_param_0
-; CHECK-DAG:  ld.param.u8     {{%r[0-9]+}}, [test_i24_param_0+2];
-; CHECK-DAG:  ld.param.u16    {{%r[0-9]+}}, [test_i24_param_0];
+; CHECK-DAG:  ld.param.b8     {{%r[0-9]+}}, [test_i24_param_0+2];
+; CHECK-DAG:  ld.param.b16    {{%r[0-9]+}}, [test_i24_param_0];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], {{%r[0-9]+}};
 ; CHECK:      .param .b32 retval0;
@@ -608,7 +608,7 @@ define i24 @test_i24(i24 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i29(
 ; CHECK-NEXT: .param .b32 test_i29_param_0
-; CHECK:      ld.param.u32    {{%r[0-9]+}}, [test_i29_param_0];
+; CHECK:      ld.param.b32    {{%r[0-9]+}}, [test_i29_param_0];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], {{%r[0-9]+}};
 ; CHECK:      .param .b32 retval0;
@@ -625,7 +625,7 @@ define i29 @test_i29(i29 %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_i32(
 ; CHECK-NEXT: .param .b32 test_i32_param_0
-; CHECK:      ld.param.u32    [[E:%r[0-9]+]], [test_i32_param_0];
+; CHECK:      ld.param.b32    [[E:%r[0-9]+]], [test_i32_param_0];
 ; CHECK:      .param .b32 param0;
 ; CHECK:      st.param.b32    [param0], [[E]];
 ; CHECK:      .param .b32 retval0;
@@ -642,8 +642,8 @@ define i32 @test_i32(i32 %a) {
 ; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
 ; CHECK-LABEL: test_v3i32(
 ; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
-; CHECK-DAG:  ld.param.u32     [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
-; CHECK-DAG:  ld.param.v2.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK-DAG:  ld.param.b32     [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG:  ld.param.v2.b32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
 ; CHECK:      .param .align 16 .b8 param0[16];
 ; CHECK:      st.param.v2.b32  [param0], {[[E0]], [[E1]]};
 ; CHECK:      st.param.b32     [param0+8], [[E2]];
@@ -663,7 +663,7 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) {
 ; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
 ; CHECK-LABEL: test_v4i32(
 ; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
-; CHECK:      ld.param.v4.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK:      ld.param.v4.b32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
 ; CHECK:      .param .align 16 .b8 param0[16];
 ; CHECK:      st.param.v4.b32  [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
 ; CHECK:      .param .align 16 .b8 retval0[16];
@@ -680,8 +680,8 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) {
 ; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
 ; CHECK-LABEL: test_v5i32(
 ; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
-; CHECK-DAG:  ld.param.u32     [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
-; CHECK-DAG:  ld.param.v4.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK-DAG:  ld.param.b32     [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG:  ld.param.v4.b32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
 ; CHECK:      .param .align 32 .b8 param0[32];
 ; CHECK-DAG:  st.param.v4.b32  [param0], {[[E0]], [[E1]], [[E2]], [[E3]]};
 ; CHECK-DAG:  st.param.b32     [param0+16], [[E4]];
@@ -701,14 +701,14 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) {
 ; CHECK: .func  (.param .b32 func_retval0)
 ; CHECK-LABEL: test_f32(
 ; CHECK-NEXT: .param .b32 test_f32_param_0
-; CHECK:      ld.param.f32    [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK:      ld.param.b32    [[E:%f[0-9]+]], [test_f32_param_0];
 ; CHECK:      .param .b32 param0;
-; CHECK:      st.param.f32    [param0], [[E]];
+; CHECK:      st.param.b32    [param0], [[E]];
 ; CHECK:      .param .b32 retval0;
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT: test_f32,
-; CHECK:      ld.param.f32    [[R:%f[0-9]+]], [retval0];
-; CHECK:      st.param.f32    [func_retval0], [[R]];
+; CHECK:      ld.param.b32    [[R:%f[0-9]+]], [retval0];
+; CHECK:      st.param.b32    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define float @test_f32(float %a) {
        %r = tail call float @test_f32(float %a);
@@ -718,8 +718,8 @@ define float @test_f32(float %a) {
 ; CHECK: .func  (.param .b64 func_retval0)
 ; CHECK-LABEL: test_i40(
 ; CHECK-NEXT: .param .b64 test_i40_param_0
-; CHECK-DAG:  ld.param.u8    {{%rd[0-9]+}}, [test_i40_param_0+4];
-; CHECK-DAG:  ld.param.u32   {{%rd[0-9]+}}, [test_i40_param_0];
+; CHECK-DAG:  ld.param.b8    {{%rd[0-9]+}}, [test_i40_param_0+4];
+; CHECK-DAG:  ld.param.b32   {{%rd[0-9]+}}, [test_i40_param_0];
 ; CHECK:      .param .b64 param0;
 ; CHECK:      st.param.b64    [param0], {{%rd[0-9]+}};
 ; CHECK:      .param .b64 retval0;
@@ -736,8 +736,8 @@ define i40 @test_i40(i40 %a) {
 ; CHECK: .func  (.param .b64 func_retval0)
 ; CHECK-LABEL: test_i47(
 ; CHECK-NEXT: .param .b64 test_i47_param_0
-; CHECK-DAG:  ld.param.u16   {{%rd[0-9]+}}, [test_i47_param_0+4];
-; CHECK-DAG:  ld.param.u32   {{%rd[0-9]+}}, [test_i47_param_0];
+; CHECK-DAG:  ld.param.b16   {{%rd[0-9]+}}, [test_i47_param_0+4];
+; CHECK-DAG:  ld.param.b32   {{%rd[0-9]+}}, [test_i47_param_0];
 ; CHECK:      .param .b64 param0;
 ; CHECK:      st.param.b64    [param0], {{%rd[0-9]+}};
 ; CHECK:      .param .b64 retval0;
@@ -754,8 +754,8 @@ define i47 @test_i47(i47 %a) {
 ; CHECK: .func  (.param .b64 func_retval0)
 ; CHECK-LABEL: test_i48(
 ; CHECK-NEXT: .param .b64 test_i48_param_0
-; CHECK-DAG:  ld.param.u16   {{%rd[0-9]+}}, [test_i48_param_0+4];
-; CHECK-DAG:  ld.param.u32   {{%rd[0-9]+}}, [test_i48_param_0];
+; CHECK-DAG:  ld.param.b16   {{%rd[0-9]+}}, [test_i48_param_0+4];
+; CHECK-DAG:  ld.param.b32   {{%rd[0-9]+}}, [test_i48_param_0];
 ; CHECK:      .param .b64 param0;
 ; CHECK:      st.param.b64    [param0], {{%rd[0-9]+}};
 ; CHECK:      .param .b64 retval0;
@@ -772,9 +772,9 @@ define i48 @test_i48(i48 %a) {
 ; CHECK: .func  (.param .b64 func_retval0)
 ; CHECK-LABEL: test_i51(
 ; CHECK-NEXT: .param .b64 test_i51_param_0
-; CHECK-DAG:  ld.param.u8    {{%rd[0-9]+}}, [test_i51_param_0+6];
-; CHECK-DAG:  ld.param.u16   {{%rd[0-9]+}}, [test_i51_param_0+4];
-; CHECK-DAG:  ld.param.u32   {{%rd[0-9]+}}, [test_i51_param_0];
+; CHECK-DAG:  ld.param.b8    {{%rd[0-9]+}}, [test_i51_param_0+6];
+; CHECK-DAG:  ld.param.b16   {{%rd[0-9]+}}, [test_i51_param_0+4];
+; CHECK-DAG:  ld.param.b32   {{%rd[0-9]+}}, [test_i51_param_0];
 ; CHECK:      .param .b64 param0;
 ; CHECK:      st.param.b64    [param0], {{%rd[0-9]+}};
 ; CHECK:      .param .b64 retval0;
@@ -791,9 +791,9 @@ define i51 @test_i51(i51 %a) {
 ; CHECK: .func  (.param .b64 func_retval0)
 ; CHECK-LABEL: test_i56(
 ; CHECK-NEXT: .param .b64 test_i56_param_0
-; CHECK-DAG:  ld.param.u8    {{%rd[0-9]+}}, [test_i56_param_0+6];
-; CHECK-DAG:  ld.param.u16   {{%rd[0-9]+}}, [test_i56_param_0+4];
-; CHECK-DAG:  ld.param.u32   {{%rd[0-9]+}}, [test_i56_param_0];
+; CHECK-DAG:  ld.param.b8    {{%rd[0-9]+}}, [test_i56_param_0+6];
+; CHECK-DAG:  ld.param.b16   {{%rd[0-9]+}}, [test_i56_param_0+4];
+; CHECK-DAG:  ld.param.b32   {{%rd[0-9]+}}, [test_i56_param_0];
 ; CHECK:      .param .b64 param0;
 ; CHECK:      st.param.b64    [param0], {{%rd[0-9]+}};
 ; CHECK:      .param .b64 retval0;
@@ -810,7 +810,7 @@ define i56 @test_i56(i56 %a) {
 ; CHECK: .func  (.param .b64 func_retval0)
 ; CHECK-LABEL: test_i57(
 ; CHECK-NEXT: .param .b64 test_i57_param_0
-; CHECK:      ld.param.u64    {{%rd[0-9]+}}, [test_i57_param_0];
+; CHECK:      ld.param.b64    {{%rd[0-9]+}}, [test_i57_param_0];
 ; CHECK:      .param .b64 param0;
 ; CHECK:      st.param.b64    [param0], {{%rd[0-9]+}};
 ; CHECK:      .param .b64 retval0;
@@ -827,7 +827,7 @@ define i57 @test_i57(i57 %a) {
 ; CHECK: .func  (.param .b64 func_retval0)
 ; CHECK-LABEL: test_i64(
 ; CHECK-NEXT: .param .b64 test_i64_param_0
-; CHECK:      ld.param.u64    [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK:      ld.param.b64    [[E:%rd[0-9]+]], [test_i64_param_0];
 ; CHECK:      .param .b64 param0;
 ; CHECK:      st.param.b64    [param0], [[E]];
 ; CHECK:      .param .b64 retval0;
@@ -844,8 +844,8 @@ define i64 @test_i64(i64 %a) {
 ; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
 ; CHECK-LABEL: test_v3i64(
 ; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
-; CHECK-DAG:  ld.param.u64     [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
-; CHECK-DAG:  ld.param.v2.u64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK-DAG:  ld.param.b64     [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG:  ld.param.v2.b64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
 ; CHECK:      .param .align 32 .b8 param0[32];
 ; CHECK:      st.param.v2.b64  [param0], {[[E0]], [[E1]]};
 ; CHECK:      st.param.b64     [param0+16], [[E2]];
@@ -868,8 +868,8 @@ define <3 x i64> @test_v3i64(<3 x i64> %a) {
 ; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
 ; CHECK-LABEL: test_v4i64(
 ; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
-; CHECK-DAG:  ld.param.v2.u64  {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
-; CHECK-DAG:  ld.param.v2.u64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
+; CHECK-DAG:  ld.param.v2.b64  {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
+; CHECK-DAG:  ld.param.v2.b64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
 ; CHECK:      .param .align 32 .b8 param0[32];
 ; CHECK:      st.param.v2.b64  [param0], {[[E0]], [[E1]]};
 ; CHECK:      st.param.v2.b64  [param0+16], {[[E2]], [[E3]]};
@@ -891,7 +891,7 @@ define <4 x i64> @test_v4i64(<4 x i64> %a) {
 ; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
 ; CHECK-LABEL: test_s_i1(
 ; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1]
-; CHECK:      ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
+; CHECK:      ld.param.b8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
 ; CHECK:      .param .align 1 .b8 param0[1];
 ; CHECK:      st.param.b8    [param0], [[A]]
 ; CHECK:      .param .align 1 .b8 retval0[1];
@@ -908,7 +908,7 @@ define %s_i1 @test_s_i1(%s_i1 %a) {
 ; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
 ; CHECK-LABEL: test_s_i8(
 ; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
-; CHECK:      ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
+; CHECK:      ld.param.b8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
 ; CHECK:      .param .align 1 .b8 param0[1];
 ; CHECK:      st.param.b8    [param0], [[A]]
 ; CHECK:      .param .align 1 .b8 retval0[1];
@@ -925,7 +925,7 @@ define %s_i8 @test_s_i8(%s_i8 %a) {
 ; CHECK: .func  (.param .align 2 .b8 func_retval0[2])
 ; CHECK-LABEL: test_s_i16(
 ; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
-; CHECK:      ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
+; CHECK:      ld.param.b16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
 ; CHECK:      .param .align 2 .b8 param0[2];
 ; CHECK:      st.param.b16    [param0], [[A]]
 ; CHECK:      .param .align 2 .b8 retval0[2];
@@ -959,7 +959,7 @@ define %s_f16 @test_s_f16(%s_f16 %a) {
 ; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
 ; CHECK-LABEL: test_s_i32(
 ; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
-; CHECK:      ld.param.u32    [[E:%r[0-9]+]], [test_s_i32_param_0];
+; CHECK:      ld.param.b32    [[E:%r[0-9]+]], [test_s_i32_param_0];
 ; CHECK:      .param .align 4 .b8 param0[4]
 ; CHECK:      st.param.b32    [param0], [[E]];
 ; CHECK:      .param .align 4 .b8 retval0[4];
@@ -976,14 +976,14 @@ define %s_i32 @test_s_i32(%s_i32 %a) {
 ; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
 ; CHECK-LABEL: test_s_f32(
 ; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
-; CHECK:      ld.param.f32    [[E:%f[0-9]+]], [test_s_f32_param_0];
+; CHECK:      ld.param.b32    [[E:%f[0-9]+]], [test_s_f32_param_0];
 ; CHECK:      .param .align 4 .b8 param0[4]
-; CHECK:      st.param.f32    [param0], [[E]];
+; CHECK:      st.param.b32    [param0], [[E]];
 ; CHECK:      .param .align 4 .b8 retval0[4];
 ; CHECK:      call.uni (retval0),
 ; CHECK-NEXT: test_s_f32,
-; CHECK:      ld.param.f32    [[R:%f[0-9]+]], [retval0];
-; CHECK:      st.param.f32    [func_retval0], [[R]];
+; CHECK:      ld.param.b32    [[R:%f[0-9]+]], [retval0];
+; CHECK:      st.param.b32    [func_retval0], [[R]];
 ; CHECK-NEXT: ret;
 define %s_f32 @test_s_f32(%s_f32 %a) {
        %r = tail call %s_f32 @test_s_f32(%s_f32 %a);
@@ -993,7 +993,7 @@ define %s_f32 @test_s_f32(%s_f32 %a) {
 ; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
 ; CHECK-LABEL: test_s_i64(
 ; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
-; CHECK:      ld.param.u64    [[E:%rd[0-9]+]], [test_s_i64_param_0];
+; CHECK:      ld.param.b64    [[E:%rd[0-9]+]], [test_s_i64_param_0];
 ; CHECK:      .param .align 8 .b8 param0[8];
 ; CHECK:      st.param.b64    [param0], [[E]];
 ; CHECK:      .param .align 8 .b8 retval0[8];
@@ -1011,29 +1011,29 @@ define %s_i64 @test_s_i64(%s_i64 %a) {
 ; CHECK: .func  (.param .align 8 .b8 func_retval0[24])
 ; CHECK-LABEL: test_s_i32f32(
 ; CHECK:        .param .align 8 .b8 test_s_i32f32_param_0[24]
-; CHECK-DAG:    ld.param.u64    [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
-; CHECK-DAG:    ld.param.f32    [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
-; CHECK-DAG:    ld.param.u32    [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
-; CHECK-DAG:    ld.param.f32    [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
-; CHECK-DAG:    ld.param.u32    [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK-DAG:    ld.param.b64    [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG:    ld.param.b32    [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG:    ld.param.b32    [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG:    ld.param.b32    [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG:    ld.param.b32    [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
 ; CHECK:        .param .align 8 .b8 param0[24];
 ; CHECK-DAG:    st.param.b32    [param0], [[E0]];
-; CHECK-DAG:    st.param.f32    [param0+4], [[E1]];
+; CHECK-DAG:    st.param.b32    [param0+4], [[E1]];
 ; CHECK-DAG:    st.param.b32    [param0+8], [[E2]];
-; CHECK-DAG:    st.param.f32    [param0+12], [[E3]];
+; CHECK-DAG:    st.param.b32    [param0+12], [[E3]];
 ; CHECK-DAG:    st.param.b64    [param0+16], [[E4]];
 ; CHECK:        .param .align 8 .b8 retval0[24];
 ; CHECK:        call.uni (retval0),
 ; CHECK-NEXT:   test_s_i32f32,
 ; CHECK-DAG:    ld.param.b32    [[RE0:%r[0-9]+]], [retval0];
-; CHECK-DAG:    ld.param.f32    [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG:    ld.param.b32    [[RE1:%f[0-9]+]], [retval0+4];
 ; CHECK-DAG:    ld.param.b32    [[RE2:%r[0-9]+]], [retval0+8];
-; CHECK-DAG:    ld.param.f32    [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG:    ld.param.b32    [[RE3:%f[0-9]+]], [retval0+12];
 ; CHECK-DAG:    ld.param.b64    [[RE4:%rd[0-9]+]], [retval0+16];
 ; CHECK-DAG:    st.param.b32    [func_retval0], [[RE0]];
-; CHECK-DAG:    st.param.f32    [func_retval0+4], [[RE1]];
+; CHECK-DAG:    st.param.b32    [func_retval0+4], [[RE1]];
 ; CHECK-DAG:    st.param.b32    [func_retval0+8], [[RE2]];
-; CHECK-DAG:    st.param.f32    [func_retval0+12], [[RE3]];
+; CHECK-DAG:    st.param.b32    [func_retval0+12], [[RE3]];
 ; CHECK-DAG:    st.param.b64    [func_retval0+16], [[RE4]];
 ; CHECK:        ret;
 define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
@@ -1045,9 +1045,9 @@ define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
 ; CHECK:.visible .func  (.param .align 8 .b8 func_retval0[24])
 ; CHECK-LABEL: test_s_i32x4(
 ; CHECK:        .param .align 8 .b8 test_s_i32x4_param_0[24]
-; CHECK-DAG:    ld.param.u64    [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16];
-; CHECK-DAG:    ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
-; CHECK-DAG:    ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK-DAG:    ld.param.b64    [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG:    ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG:    ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
 ; CHECK:        .param .align 8 .b8 param0[24];
 ; CHECK:        st.param.v2.b32 [param0], {[[E0]], [[E1]]};
 ; CHECK:        st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
@@ -1071,11 +1071,11 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
 ; CHECK:.visible .func  (.param .align 8 .b8 func_retval0[32])
 ; CHECK-LABEL: test_s_i1i32x4(
 ; CHECK:        .param .align 8 .b8 test_s_i1i32x4_param_0[32]
-; CHECK:        ld.param.u64    [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
-; CHECK:        ld.param.u32    [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
-; CHECK:        ld.param.u32    [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
-; CHECK:        ld.param.u8     [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
-; CHECK:        ld.param.v2.u32         {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK:        ld.param.b64    [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK:        ld.param.b32    [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK:        ld.param.b32    [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK:        ld.param.b8     [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK:        ld.param.v2.b32         {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
 ; CHECK:        .param .align 8 .b8 param0[32];
 ; CHECK:        st.param.v2.b32 [param0], {[[E0]], [[E1]]};
 ; CHECK:        st.param.b8     [param0+8], [[E2]];
@@ -1110,31 +1110,31 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
 ; CHECK:.visible .func  (.param .align 1 .b8 func_retval0[25])
 ; CHECK-LABEL: test_s_i1i32x4p(
 ; CHECK-DAG:        .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+24];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+23];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+22];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+21];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+20];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+19];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+18];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+17];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+16];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+15];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+14];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+13];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+12];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+11];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+10];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+9];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+8];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+7];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+6];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+5];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+4];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+3];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+2];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+1];
-; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+24];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+23];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+22];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+21];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+20];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+19];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+18];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+17];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+16];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+15];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+14];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+13];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+12];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+11];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+10];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+9];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+8];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+7];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+6];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+5];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+4];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+3];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+2];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0+1];
+; CHECK-DAG:        ld.param.b8     %r{{.*}}, [test_s_i1i32x4p_param_0];
 ; CHECK:        .param .align 1 .b8 param0[25];
 ; CHECK-DAG:        st.param.b8     [param0],
 ; CHECK-DAG:        st.param.b8     [param0+1],
@@ -1225,12 +1225,12 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
 ; CHECK:.visible .func  (.param .align 16 .b8 func_retval0[80])
 ; CHECK-LABEL: test_s_crossfield(
 ; CHECK:        .param .align 16 .b8 test_s_crossfield_param_0[80]
-; CHECK:        ld.param.u32    [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
-; CHECK:        ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
-; CHECK:        ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
-; CHECK:        ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
-; CHECK:        ld.param.u32    [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
-; CHECK:        ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
+; CHECK:        ld.param.b32    [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
+; CHECK:        ld.param.v4.b32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
+; CHECK:        ld.param.v4.b32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
+; CHECK:        ld.param.v4.b32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
+; CHECK:        ld.param.b32    [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
+; CHECK:        ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
 ; CHECK:        .param .align 16 .b8 param0[80];
 ; CHECK:        st.param.v2.b32 [param0], {[[E0]], [[E1]]};
 ; CHECK:        st.param.b32    [param0+8], [[E2]];
diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll
index 374475a29ffa1..e3d611865f1f0 100644
--- a/llvm/test/CodeGen/NVPTX/param-overalign.ll
+++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll
@@ -24,20 +24,20 @@ define float @caller_md(float %a, float %b) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.f32 %f1, [caller_md_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [caller_md_param_1];
+; CHECK:         ld.param.b32 %f1, [caller_md_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [caller_md_param_1];
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {%f1, %f2};
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    call.uni (retval0),
 ; CHECK-NEXT:    callee_md,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.f32 %f3, [retval0];
+; CHECK-NEXT:    ld.param.b32 %f3, [retval0];
 ; CHECK-NEXT:    }
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %s1 = insertvalue %struct.float2 poison, float %a, 0
   %s2 = insertvalue %struct.float2 %s1, float %b, 1
@@ -51,9 +51,9 @@ define float @callee_md(%struct.float2 alignstack(8) %a) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.v2.f32 {%f1, %f2}, [callee_md_param_0];
+; CHECK:         ld.param.v2.b32 {%f1, %f2}, [callee_md_param_0];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %v0 = extractvalue %struct.float2 %a, 0
   %v1 = extractvalue %struct.float2 %a, 1
@@ -68,20 +68,20 @@ define float @caller(float %a, float %b) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.f32 %f1, [caller_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [caller_param_1];
+; CHECK:         ld.param.b32 %f1, [caller_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [caller_param_1];
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.f32 [param0], {%f1, %f2};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {%f1, %f2};
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    call.uni (retval0),
 ; CHECK-NEXT:    callee,
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.f32 %f3, [retval0];
+; CHECK-NEXT:    ld.param.b32 %f3, [retval0];
 ; CHECK-NEXT:    }
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %s1 = insertvalue %struct.float2 poison, float %a, 0
   %s2 = insertvalue %struct.float2 %s1, float %b, 1
@@ -95,9 +95,9 @@ define float @callee(%struct.float2 alignstack(8) %a ) {
 ; CHECK-NEXT: )
 ; CHECK-NEXT: {
 
-; CHECK:         ld.param.v2.f32 {%f1, %f2}, [callee_param_0];
+; CHECK:         ld.param.v2.b32 {%f1, %f2}, [callee_param_0];
 ; CHECK-NEXT:    add.rn.f32 %f3, %f1, %f2;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f3;
 ; CHECK-NEXT:    ret;
   %v0 = extractvalue %struct.float2 %a, 0
   %v1 = extractvalue %struct.float2 %a, 1
diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index db8b1a6f53d13..abb1aff867754 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -103,7 +103,7 @@ define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) {
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[4])
   ; CHECK-LABEL: callee_St4x1(
   ; CHECK-NEXT:  .param .b32 callee_St4x1_param_0
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0];
+  ; CHECK:       ld.param.b32 [[R1:%r[0-9]+]], [callee_St4x1_param_0];
   ; CHECK:       st.param.b32 [func_retval0], [[R1]];
   ; CHECK-NEXT:  ret;
   %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0
@@ -140,7 +140,7 @@ define internal fastcc [2 x i32] @callee_St4x2(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
   ; CHECK-LABEL: callee_St4x2(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x2_param_0[8]
-  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0];
+  ; CHECK:       ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0];
   ; CHECK:       st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
   ; CHECK-NEXT:  ret;
   %1 = load i32, ptr %in, align 4
@@ -183,8 +183,8 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[12])
   ; CHECK-LABEL: callee_St4x3(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x3_param_0[12]
-  ; CHECK:       ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
-  ; CHECK:       ld.param.u32    [[R3:%r[0-9]+]],  [callee_St4x3_param_0+8];
+  ; CHECK:       ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
+  ; CHECK:       ld.param.b32    [[R3:%r[0-9]+]],  [callee_St4x3_param_0+8];
   ; CHECK:       st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
   ; CHECK:       st.param.b32    [func_retval0+8], [[R3]];
   ; CHECK-NEXT:  ret;
@@ -232,7 +232,7 @@ define internal fastcc [4 x i32] @callee_St4x4(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
   ; CHECK-LABEL: callee_St4x4(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_param_0[16]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0];
+  ; CHECK:       ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0];
   ; CHECK:       st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
   ; CHECK-NEXT:  ret;
   %1 = load i32, ptr %in, align 4
@@ -287,8 +287,8 @@ define internal fastcc [5 x i32] @callee_St4x5(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[20])
   ; CHECK-LABEL: callee_St4x5(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x5_param_0[20]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0];
-  ; CHECK:       ld.param.u32    [[R5:%r[0-9]+]],   [callee_St4x5_param_0+16];
+  ; CHECK:       ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0];
+  ; CHECK:       ld.param.b32    [[R5:%r[0-9]+]],   [callee_St4x5_param_0+16];
   ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
   ; CHECK:       st.param.b32    [func_retval0+16], [[R5]];
   ; CHECK-NEXT:  ret;
@@ -350,8 +350,8 @@ define internal fastcc [6 x i32] @callee_St4x6(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
   ; CHECK-LABEL: callee_St4x6(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x6_param_0[24]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0];
-  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16];
+  ; CHECK:       ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0];
+  ; CHECK:       ld.param.v2.b32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16];
   ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
   ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
   ; CHECK-NEXT:  ret;
@@ -421,9 +421,9 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[28])
   ; CHECK-LABEL: callee_St4x7(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x7_param_0[28]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
-  ; CHECK:       ld.param.v2.u32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
-  ; CHECK:       ld.param.u32    [[R7:%r[0-9]+]],   [callee_St4x7_param_0+24];
+  ; CHECK:       ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
+  ; CHECK:       ld.param.v2.b32 {[[R5:%r[0-9]+]],  [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
+  ; CHECK:       ld.param.b32    [[R7:%r[0-9]+]],   [callee_St4x7_param_0+24];
   ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
   ; CHECK:       st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
   ; CHECK:       st.param.b32    [func_retval0+24], [[R7]];
@@ -498,8 +498,8 @@ define internal fastcc [8 x i32] @callee_St4x8(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
   ; CHECK-LABEL: callee_St4x8(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x8_param_0[32]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0];
-  ; CHECK:       ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16];
+  ; CHECK:       ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0];
+  ; CHECK:       ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16];
   ; CHECK:       st.param.v4.b32 [func_retval0],  {[[R1]], [[R2]], [[R3]], [[R4]]};
   ; CHECK:       st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]};
   ; CHECK-NEXT:  ret;
@@ -554,7 +554,7 @@ define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) {
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[8])
   ; CHECK-LABEL: callee_St8x1(
   ; CHECK-NEXT:  .param .b64 callee_St8x1_param_0
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0];
+  ; CHECK:       ld.param.b64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0];
   ; CHECK:       st.param.b64 [func_retval0],  [[RD1]];
   ; CHECK-NEXT:  ret;
   %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0
@@ -588,7 +588,7 @@ define internal fastcc [2 x i64] @callee_St8x2(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
   ; CHECK-LABEL: callee_St8x2(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x2_param_0[16]
-  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0];
+  ; CHECK:       ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0];
   ; CHECK:       st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]};
   ; CHECK-NEXT:  ret;
   %1 = load i64, ptr %in, align 8
@@ -631,8 +631,8 @@ define internal fastcc [3 x i64] @callee_St8x3(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[24])
   ; CHECK-LABEL: callee_St8x3(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x3_param_0[24]
-  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0];
-  ; CHECK:       ld.param.u64    [[RD3:%rd[0-9]+]],  [callee_St8x3_param_0+16];
+  ; CHECK:       ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0];
+  ; CHECK:       ld.param.b64    [[RD3:%rd[0-9]+]],  [callee_St8x3_param_0+16];
   ; CHECK:       st.param.v2.b64 [func_retval0],   {[[RD1]], [[RD2]]};
   ; CHECK:       st.param.b64    [func_retval0+16],  [[RD3]];
   ; CHECK-NEXT:  ret;
@@ -682,8 +682,8 @@ define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly by
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[32])
   ; CHECK-LABEL: callee_St8x4(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St8x4_param_0[32]
-  ; CHECK:       ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0];
-  ; CHECK:       ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16];
+  ; CHECK:       ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0];
+  ; CHECK:       ld.param.v2.b64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16];
   ; CHECK:       st.param.v2.b64 [func_retval0],  {[[RD1]], [[RD2]]};
   ; CHECK:       st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]};
   ; CHECK-NEXT:  ret;
@@ -707,7 +707,7 @@ define private fastcc [4 x i32] @callee_St4x4_private(ptr nocapture noundef read
   ; CHECK:       .func  (.param .align 16 .b8 func_retval0[16])
   ; CHECK-LABEL: callee_St4x4_private(
   ; CHECK-NEXT:  .param .align 16 .b8 callee_St4x4_private_param_0[16]
-  ; CHECK:       ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0];
+  ; CHECK:       ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0];
   ; CHECK:       st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
   ; CHECK-NEXT:  ret;
   %1 = load i32, ptr %in, align 4
@@ -731,10 +731,10 @@ define external fastcc [4 x i32] @callee_St4x4_external(ptr nocapture noundef re
   ; CHECK:       .func  (.param .align 4 .b8 func_retval0[16])
   ; CHECK-LABEL: callee_St4x4_external(
   ; CHECK-NEXT:  .param .align 4 .b8 callee_St4x4_external_param_0[16]
-  ; CHECK:       ld.param.u32 [[R1:%r[0-9]+]],   [callee_St4x4_external_param_0];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]],   [callee_St4x4_external_param_0+4];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]],   [callee_St4x4_external_param_0+8];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]],   [callee_St4x4_external_param_0+12];
+  ; CHECK:       ld.param.b32 [[R1:%r[0-9]+]],   [callee_St4x4_external_param_0];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]],   [callee_St4x4_external_param_0+4];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]],   [callee_St4x4_external_param_0+8];
+  ; CHECK:       ld.param.b32 [[R4:%r[0-9]+]],   [callee_St4x4_external_param_0+12];
   ; CHECK:       st.param.b32 [func_retval0],  [[R1]];
   ; CHECK:       st.param.b32 [func_retval0+4],  [[R2]];
   ; CHECK:       st.param.b32 [func_retval0+8],  [[R3]];
diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
index 9ca1eddee9d7b..410653805c986 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll
@@ -65,9 +65,9 @@ define dso_local void @foo_St4x1(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x1_param_0[4],
   ; CHECK:               .param .b64 foo_St4x1_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x1_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x1_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x1_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -79,11 +79,11 @@ define dso_local void @foo_St4x2(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x2_param_0[8],
   ; CHECK:               .param .b64 foo_St4x2_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x2_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x2_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x2_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4];
+  ; CHECK:       st.b32  [[[R1]]+4], [[R3]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -99,13 +99,13 @@ define dso_local void @foo_St4x3(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x3_param_0[12],
   ; CHECK:               .param .b64 foo_St4x3_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x3_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x3_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x3_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4];
+  ; CHECK:       st.b32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8];
+  ; CHECK:       st.b32  [[[R1]]+8], [[R4]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -125,15 +125,15 @@ define dso_local void @foo_St4x4(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x4_param_0[16],
   ; CHECK:               .param .b64 foo_St4x4_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x4_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x4_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x4_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4];
+  ; CHECK:       st.b32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8];
+  ; CHECK:       st.b32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12];
+  ; CHECK:       st.b32  [[[R1]]+12], [[R5]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -157,17 +157,17 @@ define dso_local void @foo_St4x5(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x5_param_0[20],
   ; CHECK:               .param .b64 foo_St4x5_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x5_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x5_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x5_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4];
+  ; CHECK:       st.b32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8];
+  ; CHECK:       st.b32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12];
+  ; CHECK:       st.b32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16];
+  ; CHECK:       st.b32  [[[R1]]+16], [[R6]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -195,19 +195,19 @@ define dso_local void @foo_St4x6(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x6_param_0[24],
   ; CHECK:               .param .b64 foo_St4x6_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x6_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x6_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
-  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20];
-  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x6_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x6_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4];
+  ; CHECK:       st.b32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8];
+  ; CHECK:       st.b32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12];
+  ; CHECK:       st.b32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16];
+  ; CHECK:       st.b32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ld.param.b32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20];
+  ; CHECK:       st.b32  [[[R1]]+20], [[R7]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -239,21 +239,21 @@ define dso_local void @foo_St4x7(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x7_param_0[28],
   ; CHECK:               .param .b64 foo_St4x7_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x7_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
-  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20];
-  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
-  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24];
-  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x7_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x7_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4];
+  ; CHECK:       st.b32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8];
+  ; CHECK:       st.b32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12];
+  ; CHECK:       st.b32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16];
+  ; CHECK:       st.b32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ld.param.b32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20];
+  ; CHECK:       st.b32  [[[R1]]+20], [[R7]];
+  ; CHECK:       ld.param.b32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24];
+  ; CHECK:       st.b32  [[[R1]]+24], [[R8]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -289,23 +289,23 @@ define dso_local void @foo_St4x8(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 4 .b8 foo_St4x8_param_0[32],
   ; CHECK:               .param .b64 foo_St4x8_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x8_param_1];
-  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0];
-  ; CHECK:       st.u32  [[[R1]]], [[R2]];
-  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4];
-  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
-  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8];
-  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
-  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12];
-  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
-  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16];
-  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
-  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20];
-  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
-  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24];
-  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
-  ; CHECK:       ld.param.u32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28];
-  ; CHECK:       st.u32  [[[R1]]+28], [[R9]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x8_param_1];
+  ; CHECK:       ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x8_param_0];
+  ; CHECK:       st.b32  [[[R1]]], [[R2]];
+  ; CHECK:       ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4];
+  ; CHECK:       st.b32  [[[R1]]+4], [[R3]];
+  ; CHECK:       ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8];
+  ; CHECK:       st.b32  [[[R1]]+8], [[R4]];
+  ; CHECK:       ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12];
+  ; CHECK:       st.b32  [[[R1]]+12], [[R5]];
+  ; CHECK:       ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16];
+  ; CHECK:       st.b32  [[[R1]]+16], [[R6]];
+  ; CHECK:       ld.param.b32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20];
+  ; CHECK:       st.b32  [[[R1]]+20], [[R7]];
+  ; CHECK:       ld.param.b32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24];
+  ; CHECK:       st.b32  [[[R1]]+24], [[R8]];
+  ; CHECK:       ld.param.b32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28];
+  ; CHECK:       st.b32  [[[R1]]+28], [[R9]];
   ; CHECK:       ret;
   %1 = load i32, ptr %in, align 4
   store i32 %1, ptr %ret, align 4
@@ -345,9 +345,9 @@ define dso_local void @foo_St8x1(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 8 .b8 foo_St8x1_param_0[8],
   ; CHECK:               .param .b64 foo_St8x1_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x1_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x1_param_1];
+  ; CHECK:       ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0];
+  ; CHECK:       st.b64 [[[R1]]], [[RD1]];
   ; CHECK:       ret;
   %1 = load i64, ptr %in, align 8
   store i64 %1, ptr %ret, align 8
@@ -359,11 +359,11 @@ define dso_local void @foo_St8x2(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 8 .b8 foo_St8x2_param_0[16],
   ; CHECK:               .param .b64 foo_St8x2_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x2_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
-  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8];
-  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x2_param_1];
+  ; CHECK:       ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0];
+  ; CHECK:       st.b64 [[[R1]]], [[RD1]];
+  ; CHECK:       ld.param.b64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8];
+  ; CHECK:       st.b64 [[[R1]]+8], [[RD2]];
   ; CHECK:       ret;
   %1 = load i64, ptr %in, align 8
   store i64 %1, ptr %ret, align 8
@@ -379,13 +379,13 @@ define dso_local void @foo_St8x3(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 8 .b8 foo_St8x3_param_0[24],
   ; CHECK:               .param .b64 foo_St8x3_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x3_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
-  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8];
-  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
-  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16];
-  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x3_param_1];
+  ; CHECK:       ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0];
+  ; CHECK:       st.b64 [[[R1]]], [[RD1]];
+  ; CHECK:       ld.param.b64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8];
+  ; CHECK:       st.b64 [[[R1]]+8], [[RD2]];
+  ; CHECK:       ld.param.b64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16];
+  ; CHECK:       st.b64 [[[R1]]+16], [[RD3]];
   ; CHECK:       ret;
   %1 = load i64, ptr %in, align 8
   store i64 %1, ptr %ret, align 8
@@ -405,15 +405,15 @@ define dso_local void @foo_St8x4(ptr nocapture noundef readonly byval(%struct.St
   ; CHECK:               .param .align 8 .b8 foo_St8x4_param_0[32],
   ; CHECK:               .param .b64 foo_St8x4_param_1
   ; CHECK:       )
-  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x4_param_1];
-  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0];
-  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
-  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8];
-  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
-  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16];
-  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
-  ; CHECK:       ld.param.u64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24];
-  ; CHECK:       st.u64 [[[R1]]+24], [[RD4]];
+  ; CHECK:       ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x4_param_1];
+  ; CHECK:       ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0];
+  ; CHECK:       st.b64 [[[R1]]], [[RD1]];
+  ; CHECK:       ld.param.b64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8];
+  ; CHECK:       st.b64 [[[R1]]+8], [[RD2]];
+  ; CHECK:       ld.param.b64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16];
+  ; CHECK:       st.b64 [[[R1]]+16], [[RD3]];
+  ; CHECK:       ld.param.b64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24];
+  ; CHECK:       st.b64 [[[R1]]+24], [[RD4]];
   ; CHECK:       ret;
   %1 = load i64, ptr %in, align 8
   store i64 %1, ptr %ret, align 8
diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
index 191d4711e8034..b6f1964c54c76 100644
--- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -5,19 +5,19 @@
 
 define ptx_kernel void @t1(ptr %a) {
 ; PTX32:      mov.b16 %rs{{[0-9]+}}, 0;
-; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX32-NEXT: st.global.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
 ; PTX64:      mov.b16 %rs{{[0-9]+}}, 0;
-; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX64-NEXT: st.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
   store i1 false, ptr %a
   ret void
 }
 
 
 define ptx_kernel void @t2(ptr %a, ptr %b) {
-; PTX32: ld.global.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.b8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX32: setp.ne.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 0;
-; PTX64: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; PTX64: ld.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX64: setp.ne.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 0;
 
diff --git a/llvm/test/CodeGen/NVPTX/pr16278.ll b/llvm/test/CodeGen/NVPTX/pr16278.ll
index ad832dcde35a2..508786be03030 100644
--- a/llvm/test/CodeGen/NVPTX/pr16278.ll
+++ b/llvm/test/CodeGen/NVPTX/pr16278.ll
@@ -4,7 +4,7 @@
 @one_f = addrspace(4) global float 1.000000e+00, align 4
 
 define float @foo() {
-; CHECK: ld.const.f32
+; CHECK: ld.const.b32
   %val = load float, ptr addrspace(4) @one_f
   ret float %val
 }
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll
index 68512bfac7a29..a64e4fe7a508e 100644
--- a/llvm/test/CodeGen/NVPTX/prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/prefetch.ll
@@ -23,7 +23,7 @@ define void @prefetch_local(ptr addrspace(5) %local_ptr) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [prefetch_local_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_local_param_0];
 ; CHECK-PTX64-NEXT:    prefetch.local.L1 [%rd1];
 ; CHECK-PTX64-NEXT:    prefetch.local.L2 [%rd1];
 ; CHECK-PTX64-NEXT:    ret;
@@ -38,7 +38,7 @@ define void @prefetch_global(ptr addrspace(1) %global_ptr) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [prefetch_global_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_global_param_0];
 ; CHECK-PTX64-NEXT:    prefetch.global.L1 [%rd1];
 ; CHECK-PTX64-NEXT:    prefetch.global.L2 [%rd1];
 ; CHECK-PTX64-NEXT:    prefetch.global.L2::evict_normal [%rd1];
@@ -58,7 +58,7 @@ define void @prefetch_(ptr %ptr) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [prefetch__param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch__param_0];
 ; CHECK-PTX64-NEXT:    prefetch.L1 [%rd1];
 ; CHECK-PTX64-NEXT:    prefetch.L2 [%rd1];
 ; CHECK-PTX64-NEXT:    ret;
@@ -73,7 +73,7 @@ define void @prefetchu_l1(ptr %ptr) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [prefetchu_l1_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetchu_l1_param_0];
 ; CHECK-PTX64-NEXT:    prefetchu.L1 [%rd1];
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
index b95a3287474c4..c39716bef4d71 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll
@@ -107,12 +107,12 @@ declare float @callee_f32()
 define  float @check_f32() {
   ; PTX-LABEL: check_f32
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.f32 [[LD:%f[0-9]+]], [retval0];
+  ; PTX-DAG: ld.param.b32 [[LD:%f[0-9]+]], [retval0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
   ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%f[0-9]+]], [[LD]];
-  ; PTX-WITHOUT-DAG: st.param.f32 [func_retval0], [[PROXY]];
-  ; PTX-WITH-DAG:    st.param.f32 [func_retval0], [[LD]];
+  ; PTX-WITHOUT-DAG: st.param.b32 [func_retval0], [[PROXY]];
+  ; PTX-WITH-DAG:    st.param.b32 [func_retval0], [[LD]];
 
   %ret = call float @callee_f32()
   ret float %ret
@@ -122,12 +122,12 @@ declare double @callee_f64()
 define  double @check_f64() {
   ; PTX-LABEL: check_f64
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.f64 [[LD:%fd[0-9]+]], [retval0];
+  ; PTX-DAG: ld.param.b64 [[LD:%fd[0-9]+]], [retval0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
   ; PTX-WITHOUT-DAG: mov.b64 [[PROXY:%fd[0-9]+]], [[LD]];
-  ; PTX-WITHOUT-DAG: st.param.f64 [func_retval0], [[PROXY]];
-  ; PTX-WITH-DAG:    st.param.f64 [func_retval0], [[LD]];
+  ; PTX-WITHOUT-DAG: st.param.b64 [func_retval0], [[PROXY]];
+  ; PTX-WITH-DAG:    st.param.b64 [func_retval0], [[LD]];
 
   %ret = call double @callee_f64()
   ret double %ret
@@ -170,13 +170,13 @@ declare <2 x double> @callee_vec_f64()
 define  <2 x double> @check_vec_f64() {
   ; PTX-LABEL: check_vec_f64
   ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}}
-  ; PTX-DAG: ld.param.v2.f64 {[[LD0:%fd[0-9]+]], [[LD1:%fd[0-9]+]]}, [retval0];
+  ; PTX-DAG: ld.param.v2.b64 {[[LD0:%fd[0-9]+]], [[LD1:%fd[0-9]+]]}, [retval0];
   ; PTX-DAG: } // callseq {{[0-9]+}}
 
   ; PTX-WITHOUT-DAG: mov.b64 [[PROXY0:%fd[0-9]+]], [[LD0]];
   ; PTX-WITHOUT-DAG: mov.b64 [[PROXY1:%fd[0-9]+]], [[LD1]];
-  ; PTX-WITHOUT-DAG: st.param.v2.f64 [func_retval0], {[[PROXY0]], [[PROXY1]]};
-  ; PTX-WITH-DAG:    st.param.v2.f64 [func_retval0], {[[LD0]], [[LD1]]};
+  ; PTX-WITHOUT-DAG: st.param.v2.b64 [func_retval0], {[[PROXY0]], [[PROXY1]]};
+  ; PTX-WITH-DAG:    st.param.v2.b64 [func_retval0], {[[LD0]], [[LD1]]};
 
   %ret = call <2 x double> @callee_vec_f64()
   ret <2 x double> %ret
diff --git a/llvm/test/CodeGen/NVPTX/rcp-opt.ll b/llvm/test/CodeGen/NVPTX/rcp-opt.ll
index 0b020b7751387..e0ef5baf21bfa 100644
--- a/llvm/test/CodeGen/NVPTX/rcp-opt.ll
+++ b/llvm/test/CodeGen/NVPTX/rcp-opt.ll
@@ -12,10 +12,10 @@ define double @test1(double %in) {
 ; CHECK-NEXT:    .reg .b64 %fd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [test1_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [test1_param_0];
 ; CHECK-NEXT:    rcp.rn.f64 %fd2, %fd1;
 ; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
 ; CHECK-NEXT:    ret;
   %div = fdiv double 1.000000e+00, %in
   %neg = fsub double -0.000000e+00, %div
@@ -30,10 +30,10 @@ define double @test2(double %in) {
 ; CHECK-NEXT:    .reg .b64 %fd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [test2_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [test2_param_0];
 ; CHECK-NEXT:    rcp.rn.f64 %fd2, %fd1;
 ; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
 ; CHECK-NEXT:    ret;
   %div = fdiv double -1.000000e+00, %in
   ret double %div
@@ -47,10 +47,10 @@ define double @test3(double %in) {
 ; CHECK-NEXT:    .reg .b64 %fd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [test3_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [test3_param_0];
 ; CHECK-NEXT:    rcp.rn.f64 %fd2, %fd1;
 ; CHECK-NEXT:    neg.f64 %fd3, %fd2;
-; CHECK-NEXT:    st.param.f64 [func_retval0], %fd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %fd3;
 ; CHECK-NEXT:    ret;
   %neg = fsub double -0.000000e+00, %in
   %div = fdiv double 1.000000e+00, %neg
diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
index 1d14be9070b07..aa463b510fe84 100644
--- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
+++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
@@ -10,21 +10,21 @@
 
 ; CHECK-LABEL: test_gv_float()
 define float @test_gv_float() {
-; CHECK: ld.global.nc.f32
+; CHECK: ld.global.nc.b32
   %v = load float, ptr @gv_float
   ret float %v
 }
 
 ; CHECK-LABEL: test_gv_float2()
 define <2 x float> @test_gv_float2() {
-; CHECK: ld.global.nc.v2.f32
+; CHECK: ld.global.nc.v2.b32
   %v = load <2 x float>, ptr @gv_float2
   ret <2 x float> %v
 }
 
 ; CHECK-LABEL: test_gv_float4()
 define <4 x float> @test_gv_float4() {
-; CHECK: ld.global.nc.v4.f32
+; CHECK: ld.global.nc.v4.b32
   %v = load <4 x float>, ptr @gv_float4
   ret <4 x float> %v
 }
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index 020a61a1675aa..180b90ff90a7b 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -22,7 +22,7 @@ define half @reduce_fadd_half(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    mov.b16 %rs3, 0x0000;
 ; CHECK-NEXT:    add.rn.f16 %rs4, %rs1, %rs3;
@@ -49,7 +49,7 @@ define half @reduce_fadd_half_reassoc(<8 x half> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
 ; CHECK-SM80-NEXT:    add.rn.f16x2 %r5, %r2, %r4;
 ; CHECK-SM80-NEXT:    add.rn.f16x2 %r6, %r1, %r3;
 ; CHECK-SM80-NEXT:    add.rn.f16x2 %r7, %r6, %r5;
@@ -69,7 +69,7 @@ define half @reduce_fadd_half_reassoc(<8 x half> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0];
 ; CHECK-SM100-NEXT:    add.rn.f16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    add.rn.f16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    add.rn.f16x2 %r7, %r6, %r5;
@@ -118,8 +118,8 @@ define float @reduce_fadd_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<17>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_param_0];
 ; CHECK-NEXT:    add.rn.f32 %f9, %f1, 0f00000000;
 ; CHECK-NEXT:    add.rn.f32 %f10, %f9, %f2;
 ; CHECK-NEXT:    add.rn.f32 %f11, %f10, %f3;
@@ -128,7 +128,7 @@ define float @reduce_fadd_float(<8 x float> %in) {
 ; CHECK-NEXT:    add.rn.f32 %f14, %f13, %f6;
 ; CHECK-NEXT:    add.rn.f32 %f15, %f14, %f7;
 ; CHECK-NEXT:    add.rn.f32 %f16, %f15, %f8;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f16;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f16;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
   ret float %res
@@ -140,8 +140,8 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<17>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_param_0];
 ; CHECK-NEXT:    add.rn.f32 %f9, %f3, %f7;
 ; CHECK-NEXT:    add.rn.f32 %f10, %f1, %f5;
 ; CHECK-NEXT:    add.rn.f32 %f11, %f4, %f8;
@@ -150,7 +150,7 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    add.rn.f32 %f14, %f10, %f9;
 ; CHECK-NEXT:    add.rn.f32 %f15, %f14, %f13;
 ; CHECK-NEXT:    add.rn.f32 %f16, %f15, 0f00000000;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f16;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f16;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
   ret float %res
@@ -162,9 +162,9 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fadd_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_nonpow2_param_0];
 ; CHECK-NEXT:    add.rn.f32 %f8, %f3, %f7;
 ; CHECK-NEXT:    add.rn.f32 %f9, %f1, %f5;
 ; CHECK-NEXT:    add.rn.f32 %f10, %f9, %f8;
@@ -172,7 +172,7 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-NEXT:    add.rn.f32 %f12, %f11, %f4;
 ; CHECK-NEXT:    add.rn.f32 %f13, %f10, %f12;
 ; CHECK-NEXT:    add.rn.f32 %f14, %f13, 0f00000000;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f14;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f14;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in)
   ret float %res
@@ -186,7 +186,7 @@ define half @reduce_fmul_half(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    mul.rn.f16 %rs3, %rs1, %rs2;
 ; CHECK-NEXT:    mov.b32 {%rs4, %rs5}, %r2;
@@ -211,7 +211,7 @@ define half @reduce_fmul_half_reassoc(<8 x half> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
 ; CHECK-SM80-NEXT:    mul.rn.f16x2 %r5, %r2, %r4;
 ; CHECK-SM80-NEXT:    mul.rn.f16x2 %r6, %r1, %r3;
 ; CHECK-SM80-NEXT:    mul.rn.f16x2 %r7, %r6, %r5;
@@ -229,7 +229,7 @@ define half @reduce_fmul_half_reassoc(<8 x half> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<10>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0];
 ; CHECK-SM100-NEXT:    mul.rn.f16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    mul.rn.f16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    mul.rn.f16x2 %r7, %r6, %r5;
@@ -277,8 +277,8 @@ define float @reduce_fmul_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_param_0];
 ; CHECK-NEXT:    mul.rn.f32 %f9, %f1, %f2;
 ; CHECK-NEXT:    mul.rn.f32 %f10, %f9, %f3;
 ; CHECK-NEXT:    mul.rn.f32 %f11, %f10, %f4;
@@ -286,7 +286,7 @@ define float @reduce_fmul_float(<8 x float> %in) {
 ; CHECK-NEXT:    mul.rn.f32 %f13, %f12, %f6;
 ; CHECK-NEXT:    mul.rn.f32 %f14, %f13, %f7;
 ; CHECK-NEXT:    mul.rn.f32 %f15, %f14, %f8;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
   ret float %res
@@ -298,8 +298,8 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_param_0];
 ; CHECK-NEXT:    mul.rn.f32 %f9, %f3, %f7;
 ; CHECK-NEXT:    mul.rn.f32 %f10, %f1, %f5;
 ; CHECK-NEXT:    mul.rn.f32 %f11, %f4, %f8;
@@ -307,7 +307,7 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    mul.rn.f32 %f13, %f12, %f11;
 ; CHECK-NEXT:    mul.rn.f32 %f14, %f10, %f9;
 ; CHECK-NEXT:    mul.rn.f32 %f15, %f14, %f13;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
   ret float %res
@@ -319,16 +319,16 @@ define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmul_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_nonpow2_param_0];
 ; CHECK-NEXT:    mul.rn.f32 %f8, %f3, %f7;
 ; CHECK-NEXT:    mul.rn.f32 %f9, %f1, %f5;
 ; CHECK-NEXT:    mul.rn.f32 %f10, %f9, %f8;
 ; CHECK-NEXT:    mul.rn.f32 %f11, %f2, %f6;
 ; CHECK-NEXT:    mul.rn.f32 %f12, %f11, %f4;
 ; CHECK-NEXT:    mul.rn.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in)
   ret float %res
@@ -342,7 +342,7 @@ define half @reduce_fmax_half(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0];
 ; CHECK-NEXT:    max.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    max.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    max.f16x2 %r7, %r6, %r5;
@@ -361,7 +361,7 @@ define half @reduce_fmax_half_reassoc(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0];
 ; CHECK-NEXT:    max.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    max.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    max.f16x2 %r7, %r6, %r5;
@@ -407,8 +407,8 @@ define float @reduce_fmax_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_param_0];
 ; CHECK-NEXT:    max.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    max.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    max.f32 %f11, %f10, %f9;
@@ -416,7 +416,7 @@ define float @reduce_fmax_float(<8 x float> %in) {
 ; CHECK-NEXT:    max.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    max.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    max.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmax(<8 x float> %in)
   ret float %res
@@ -429,8 +429,8 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0];
 ; CHECK-NEXT:    max.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    max.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    max.f32 %f11, %f10, %f9;
@@ -438,7 +438,7 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    max.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    max.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    max.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in)
   ret float %res
@@ -451,16 +451,16 @@ define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0];
 ; CHECK-NEXT:    max.f32 %f8, %f3, %f7;
 ; CHECK-NEXT:    max.f32 %f9, %f1, %f5;
 ; CHECK-NEXT:    max.f32 %f10, %f9, %f8;
 ; CHECK-NEXT:    max.f32 %f11, %f2, %f6;
 ; CHECK-NEXT:    max.f32 %f12, %f11, %f4;
 ; CHECK-NEXT:    max.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in)
   ret float %res
@@ -474,7 +474,7 @@ define half @reduce_fmin_half(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0];
 ; CHECK-NEXT:    min.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    min.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    min.f16x2 %r7, %r6, %r5;
@@ -493,7 +493,7 @@ define half @reduce_fmin_half_reassoc(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_reassoc_param_0];
 ; CHECK-NEXT:    min.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    min.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    min.f16x2 %r7, %r6, %r5;
@@ -539,8 +539,8 @@ define float @reduce_fmin_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_param_0];
 ; CHECK-NEXT:    min.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    min.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    min.f32 %f11, %f10, %f9;
@@ -548,7 +548,7 @@ define float @reduce_fmin_float(<8 x float> %in) {
 ; CHECK-NEXT:    min.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    min.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    min.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmin(<8 x float> %in)
   ret float %res
@@ -561,8 +561,8 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_param_0];
 ; CHECK-NEXT:    min.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    min.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    min.f32 %f11, %f10, %f9;
@@ -570,7 +570,7 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    min.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    min.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    min.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmin(<8 x float> %in)
   ret float %res
@@ -583,16 +583,16 @@ define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmin_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_nonpow2_param_0];
 ; CHECK-NEXT:    min.f32 %f8, %f3, %f7;
 ; CHECK-NEXT:    min.f32 %f9, %f1, %f5;
 ; CHECK-NEXT:    min.f32 %f10, %f9, %f8;
 ; CHECK-NEXT:    min.f32 %f11, %f2, %f6;
 ; CHECK-NEXT:    min.f32 %f12, %f11, %f4;
 ; CHECK-NEXT:    min.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in)
   ret float %res
@@ -606,7 +606,7 @@ define half @reduce_fmaximum_half(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_param_0];
 ; CHECK-NEXT:    max.NaN.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    max.NaN.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    max.NaN.f16x2 %r7, %r6, %r5;
@@ -625,7 +625,7 @@ define half @reduce_fmaximum_half_reassoc(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_reassoc_param_0];
 ; CHECK-NEXT:    max.NaN.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    max.NaN.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    max.NaN.f16x2 %r7, %r6, %r5;
@@ -671,8 +671,8 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_param_0];
 ; CHECK-NEXT:    max.NaN.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    max.NaN.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    max.NaN.f32 %f11, %f10, %f9;
@@ -680,7 +680,7 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
 ; CHECK-NEXT:    max.NaN.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    max.NaN.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    max.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in)
   ret float %res
@@ -693,8 +693,8 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_param_0];
 ; CHECK-NEXT:    max.NaN.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    max.NaN.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    max.NaN.f32 %f11, %f10, %f9;
@@ -702,7 +702,7 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    max.NaN.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    max.NaN.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    max.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmaximum(<8 x float> %in)
   ret float %res
@@ -715,16 +715,16 @@ define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0];
 ; CHECK-NEXT:    max.NaN.f32 %f8, %f3, %f7;
 ; CHECK-NEXT:    max.NaN.f32 %f9, %f1, %f5;
 ; CHECK-NEXT:    max.NaN.f32 %f10, %f9, %f8;
 ; CHECK-NEXT:    max.NaN.f32 %f11, %f2, %f6;
 ; CHECK-NEXT:    max.NaN.f32 %f12, %f11, %f4;
 ; CHECK-NEXT:    max.NaN.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in)
   ret float %res
@@ -738,7 +738,7 @@ define half @reduce_fminimum_half(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_param_0];
 ; CHECK-NEXT:    min.NaN.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    min.NaN.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    min.NaN.f16x2 %r7, %r6, %r5;
@@ -757,7 +757,7 @@ define half @reduce_fminimum_half_reassoc(<8 x half> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_reassoc_param_0];
 ; CHECK-NEXT:    min.NaN.f16x2 %r5, %r2, %r4;
 ; CHECK-NEXT:    min.NaN.f16x2 %r6, %r1, %r3;
 ; CHECK-NEXT:    min.NaN.f16x2 %r7, %r6, %r5;
@@ -803,8 +803,8 @@ define float @reduce_fminimum_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_param_0];
 ; CHECK-NEXT:    min.NaN.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    min.NaN.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    min.NaN.f32 %f11, %f10, %f9;
@@ -812,7 +812,7 @@ define float @reduce_fminimum_float(<8 x float> %in) {
 ; CHECK-NEXT:    min.NaN.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    min.NaN.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    min.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fminimum(<8 x float> %in)
   ret float %res
@@ -825,8 +825,8 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_param_0];
 ; CHECK-NEXT:    min.NaN.f32 %f9, %f4, %f8;
 ; CHECK-NEXT:    min.NaN.f32 %f10, %f2, %f6;
 ; CHECK-NEXT:    min.NaN.f32 %f11, %f10, %f9;
@@ -834,7 +834,7 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    min.NaN.f32 %f13, %f1, %f5;
 ; CHECK-NEXT:    min.NaN.f32 %f14, %f13, %f12;
 ; CHECK-NEXT:    min.NaN.f32 %f15, %f14, %f11;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f15;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f15;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in)
   ret float %res
@@ -847,16 +847,16 @@ define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
 ; CHECK-NEXT:    .reg .b32 %f<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %f7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%f5, %f6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_nonpow2_param_0];
 ; CHECK-NEXT:    min.NaN.f32 %f8, %f3, %f7;
 ; CHECK-NEXT:    min.NaN.f32 %f9, %f1, %f5;
 ; CHECK-NEXT:    min.NaN.f32 %f10, %f9, %f8;
 ; CHECK-NEXT:    min.NaN.f32 %f11, %f2, %f6;
 ; CHECK-NEXT:    min.NaN.f32 %f12, %f11, %f4;
 ; CHECK-NEXT:    min.NaN.f32 %f13, %f10, %f12;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f13;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f13;
 ; CHECK-NEXT:    ret;
   %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in)
   ret float %res
@@ -869,7 +869,7 @@ define i16 @reduce_add_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<6>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; CHECK-SM80-NEXT:    add.s16 %rs5, %rs3, %rs1;
@@ -891,7 +891,7 @@ define i16 @reduce_add_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
 ; CHECK-SM100-NEXT:    add.s16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    add.s16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    add.s16x2 %r7, %r6, %r5;
@@ -914,10 +914,10 @@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.u32 %r1, [reduce_add_i16_nonpow2_param_0+8];
+; CHECK-SM80-NEXT:    ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT:    ld.param.u16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
+; CHECK-SM80-NEXT:    ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
+; CHECK-SM80-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
 ; CHECK-SM80-NEXT:    add.s16 %rs8, %rs3, %rs7;
 ; CHECK-SM80-NEXT:    add.s16 %rs9, %rs1, %rs5;
 ; CHECK-SM80-NEXT:    add.s16 %rs10, %rs9, %rs8;
@@ -934,12 +934,12 @@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.u32 %r1, [reduce_add_i16_nonpow2_param_0+8];
+; CHECK-SM100-NEXT:    ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8];
 ; CHECK-SM100-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0];
 ; CHECK-SM100-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-SM100-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT:    ld.param.u16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
+; CHECK-SM100-NEXT:    ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12];
 ; CHECK-SM100-NEXT:    mov.b16 %rs8, 0;
 ; CHECK-SM100-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-SM100-NEXT:    add.s16x2 %r5, %r3, %r4;
@@ -960,8 +960,8 @@ define i32 @reduce_add_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0];
 ; CHECK-NEXT:    add.s32 %r9, %r3, %r7;
 ; CHECK-NEXT:    add.s32 %r10, %r1, %r5;
 ; CHECK-NEXT:    add.s32 %r11, %r4, %r8;
@@ -981,9 +981,9 @@ define i32 @reduce_add_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_add_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_add_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0];
 ; CHECK-NEXT:    add.s32 %r8, %r3, %r7;
 ; CHECK-NEXT:    add.s32 %r9, %r1, %r5;
 ; CHECK-NEXT:    add.s32 %r10, %r9, %r8;
@@ -1003,7 +1003,7 @@ define i16 @reduce_mul_i16(<8 x i16> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; CHECK-NEXT:    mul.lo.s16 %rs5, %rs3, %rs1;
@@ -1029,10 +1029,10 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [reduce_mul_i16_nonpow2_param_0+8];
+; CHECK-NEXT:    ld.param.b32 %r1, [reduce_mul_i16_nonpow2_param_0+8];
 ; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT:    ld.param.u16 %rs7, [reduce_mul_i16_nonpow2_param_0+12];
-; CHECK-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs7, [reduce_mul_i16_nonpow2_param_0+12];
+; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0];
 ; CHECK-NEXT:    mul.lo.s16 %rs8, %rs3, %rs7;
 ; CHECK-NEXT:    mul.lo.s16 %rs9, %rs1, %rs5;
 ; CHECK-NEXT:    mul.lo.s16 %rs10, %rs9, %rs8;
@@ -1052,8 +1052,8 @@ define i32 @reduce_mul_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0];
 ; CHECK-NEXT:    mul.lo.s32 %r9, %r3, %r7;
 ; CHECK-NEXT:    mul.lo.s32 %r10, %r1, %r5;
 ; CHECK-NEXT:    mul.lo.s32 %r11, %r4, %r8;
@@ -1073,9 +1073,9 @@ define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_mul_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_mul_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0];
 ; CHECK-NEXT:    mul.lo.s32 %r8, %r3, %r7;
 ; CHECK-NEXT:    mul.lo.s32 %r9, %r1, %r5;
 ; CHECK-NEXT:    mul.lo.s32 %r10, %r9, %r8;
@@ -1095,7 +1095,7 @@ define i16 @reduce_umax_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<6>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; CHECK-SM80-NEXT:    max.u16 %rs5, %rs3, %rs1;
@@ -1117,7 +1117,7 @@ define i16 @reduce_umax_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
 ; CHECK-SM100-NEXT:    max.u16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    max.u16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    max.u16x2 %r7, %r6, %r5;
@@ -1140,10 +1140,10 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.u32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
+; CHECK-SM80-NEXT:    ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT:    ld.param.u16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
+; CHECK-SM80-NEXT:    ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
+; CHECK-SM80-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
 ; CHECK-SM80-NEXT:    max.u16 %rs8, %rs3, %rs7;
 ; CHECK-SM80-NEXT:    max.u16 %rs9, %rs1, %rs5;
 ; CHECK-SM80-NEXT:    max.u16 %rs10, %rs9, %rs8;
@@ -1160,12 +1160,12 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.u32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
+; CHECK-SM100-NEXT:    ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8];
 ; CHECK-SM100-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0];
 ; CHECK-SM100-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-SM100-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT:    ld.param.u16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
+; CHECK-SM100-NEXT:    ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12];
 ; CHECK-SM100-NEXT:    mov.b16 %rs8, 0;
 ; CHECK-SM100-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-SM100-NEXT:    max.u16x2 %r5, %r3, %r4;
@@ -1186,8 +1186,8 @@ define i32 @reduce_umax_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0];
 ; CHECK-NEXT:    max.u32 %r9, %r3, %r7;
 ; CHECK-NEXT:    max.u32 %r10, %r1, %r5;
 ; CHECK-NEXT:    max.u32 %r11, %r4, %r8;
@@ -1207,9 +1207,9 @@ define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_umax_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_umax_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0];
 ; CHECK-NEXT:    max.u32 %r8, %r3, %r7;
 ; CHECK-NEXT:    max.u32 %r9, %r1, %r5;
 ; CHECK-NEXT:    max.u32 %r10, %r9, %r8;
@@ -1229,7 +1229,7 @@ define i16 @reduce_umin_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<6>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; CHECK-SM80-NEXT:    min.u16 %rs5, %rs3, %rs1;
@@ -1251,7 +1251,7 @@ define i16 @reduce_umin_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
 ; CHECK-SM100-NEXT:    min.u16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    min.u16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    min.u16x2 %r7, %r6, %r5;
@@ -1274,10 +1274,10 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.u32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
+; CHECK-SM80-NEXT:    ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT:    ld.param.u16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
+; CHECK-SM80-NEXT:    ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
+; CHECK-SM80-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
 ; CHECK-SM80-NEXT:    min.u16 %rs8, %rs3, %rs7;
 ; CHECK-SM80-NEXT:    min.u16 %rs9, %rs1, %rs5;
 ; CHECK-SM80-NEXT:    min.u16 %rs10, %rs9, %rs8;
@@ -1294,12 +1294,12 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.u32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
+; CHECK-SM100-NEXT:    ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8];
 ; CHECK-SM100-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0];
 ; CHECK-SM100-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-SM100-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT:    ld.param.u16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
+; CHECK-SM100-NEXT:    ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12];
 ; CHECK-SM100-NEXT:    mov.b16 %rs8, -1;
 ; CHECK-SM100-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-SM100-NEXT:    min.u16x2 %r5, %r3, %r4;
@@ -1320,8 +1320,8 @@ define i32 @reduce_umin_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0];
 ; CHECK-NEXT:    min.u32 %r9, %r3, %r7;
 ; CHECK-NEXT:    min.u32 %r10, %r1, %r5;
 ; CHECK-NEXT:    min.u32 %r11, %r4, %r8;
@@ -1341,9 +1341,9 @@ define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_umin_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_umin_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0];
 ; CHECK-NEXT:    min.u32 %r8, %r3, %r7;
 ; CHECK-NEXT:    min.u32 %r9, %r1, %r5;
 ; CHECK-NEXT:    min.u32 %r10, %r9, %r8;
@@ -1363,7 +1363,7 @@ define i16 @reduce_smax_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<6>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; CHECK-SM80-NEXT:    max.s16 %rs5, %rs3, %rs1;
@@ -1385,7 +1385,7 @@ define i16 @reduce_smax_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
 ; CHECK-SM100-NEXT:    max.s16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    max.s16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    max.s16x2 %r7, %r6, %r5;
@@ -1408,10 +1408,10 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.u32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
+; CHECK-SM80-NEXT:    ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT:    ld.param.u16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
+; CHECK-SM80-NEXT:    ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
+; CHECK-SM80-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
 ; CHECK-SM80-NEXT:    max.s16 %rs8, %rs3, %rs7;
 ; CHECK-SM80-NEXT:    max.s16 %rs9, %rs1, %rs5;
 ; CHECK-SM80-NEXT:    max.s16 %rs10, %rs9, %rs8;
@@ -1428,12 +1428,12 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.u32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
+; CHECK-SM100-NEXT:    ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8];
 ; CHECK-SM100-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0];
 ; CHECK-SM100-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-SM100-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT:    ld.param.u16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
+; CHECK-SM100-NEXT:    ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12];
 ; CHECK-SM100-NEXT:    mov.b16 %rs8, -32768;
 ; CHECK-SM100-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-SM100-NEXT:    max.s16x2 %r5, %r3, %r4;
@@ -1454,8 +1454,8 @@ define i32 @reduce_smax_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0];
 ; CHECK-NEXT:    max.s32 %r9, %r3, %r7;
 ; CHECK-NEXT:    max.s32 %r10, %r1, %r5;
 ; CHECK-NEXT:    max.s32 %r11, %r4, %r8;
@@ -1475,9 +1475,9 @@ define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_smax_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_smax_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0];
 ; CHECK-NEXT:    max.s32 %r8, %r3, %r7;
 ; CHECK-NEXT:    max.s32 %r9, %r1, %r5;
 ; CHECK-NEXT:    max.s32 %r10, %r9, %r8;
@@ -1497,7 +1497,7 @@ define i16 @reduce_smin_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<6>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
 ; CHECK-SM80-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
 ; CHECK-SM80-NEXT:    min.s16 %rs5, %rs3, %rs1;
@@ -1519,7 +1519,7 @@ define i16 @reduce_smin_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
 ; CHECK-SM100-NEXT:    min.s16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    min.s16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    min.s16x2 %r7, %r6, %r5;
@@ -1542,10 +1542,10 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<3>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.u32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
+; CHECK-SM80-NEXT:    ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
 ; CHECK-SM80-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM80-NEXT:    ld.param.u16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
-; CHECK-SM80-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
+; CHECK-SM80-NEXT:    ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
+; CHECK-SM80-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
 ; CHECK-SM80-NEXT:    min.s16 %rs8, %rs3, %rs7;
 ; CHECK-SM80-NEXT:    min.s16 %rs9, %rs1, %rs5;
 ; CHECK-SM80-NEXT:    min.s16 %rs10, %rs9, %rs8;
@@ -1562,12 +1562,12 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.u32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
+; CHECK-SM100-NEXT:    ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8];
 ; CHECK-SM100-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM100-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0];
 ; CHECK-SM100-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-SM100-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-SM100-NEXT:    ld.param.u16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
+; CHECK-SM100-NEXT:    ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12];
 ; CHECK-SM100-NEXT:    mov.b16 %rs8, 32767;
 ; CHECK-SM100-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-SM100-NEXT:    min.s16x2 %r5, %r3, %r4;
@@ -1588,8 +1588,8 @@ define i32 @reduce_smin_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0];
 ; CHECK-NEXT:    min.s32 %r9, %r3, %r7;
 ; CHECK-NEXT:    min.s32 %r10, %r1, %r5;
 ; CHECK-NEXT:    min.s32 %r11, %r4, %r8;
@@ -1609,9 +1609,9 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_smin_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_smin_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0];
 ; CHECK-NEXT:    min.s32 %r8, %r3, %r7;
 ; CHECK-NEXT:    min.s32 %r9, %r1, %r5;
 ; CHECK-NEXT:    min.s32 %r10, %r9, %r8;
@@ -1631,7 +1631,7 @@ define i16 @reduce_and_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
 ; CHECK-SM80-NEXT:    and.b32 %r5, %r2, %r4;
 ; CHECK-SM80-NEXT:    and.b32 %r6, %r1, %r3;
 ; CHECK-SM80-NEXT:    and.b32 %r7, %r6, %r5;
@@ -1650,7 +1650,7 @@ define i16 @reduce_and_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
 ; CHECK-SM100-NEXT:    and.b32 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    and.b32 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    and.b32 %r7, %r6, %r5;
@@ -1673,12 +1673,12 @@ define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [reduce_and_i16_nonpow2_param_0+8];
+; CHECK-NEXT:    ld.param.b32 %r1, [reduce_and_i16_nonpow2_param_0+8];
 ; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0];
 ; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-NEXT:    ld.param.u16 %rs7, [reduce_and_i16_nonpow2_param_0+12];
+; CHECK-NEXT:    ld.param.b16 %rs7, [reduce_and_i16_nonpow2_param_0+12];
 ; CHECK-NEXT:    mov.b16 %rs8, -1;
 ; CHECK-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-NEXT:    and.b32 %r5, %r3, %r4;
@@ -1699,8 +1699,8 @@ define i32 @reduce_and_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0];
 ; CHECK-NEXT:    and.b32 %r9, %r3, %r7;
 ; CHECK-NEXT:    and.b32 %r10, %r1, %r5;
 ; CHECK-NEXT:    and.b32 %r11, %r4, %r8;
@@ -1720,9 +1720,9 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_and_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_and_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0];
 ; CHECK-NEXT:    and.b32 %r8, %r3, %r7;
 ; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
 ; CHECK-NEXT:    and.b32 %r10, %r9, %r8;
@@ -1742,7 +1742,7 @@ define i16 @reduce_or_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
 ; CHECK-SM80-NEXT:    or.b32 %r5, %r2, %r4;
 ; CHECK-SM80-NEXT:    or.b32 %r6, %r1, %r3;
 ; CHECK-SM80-NEXT:    or.b32 %r7, %r6, %r5;
@@ -1761,7 +1761,7 @@ define i16 @reduce_or_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
 ; CHECK-SM100-NEXT:    or.b32 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    or.b32 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    or.b32 %r7, %r6, %r5;
@@ -1784,12 +1784,12 @@ define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [reduce_or_i16_nonpow2_param_0+8];
+; CHECK-NEXT:    ld.param.b32 %r1, [reduce_or_i16_nonpow2_param_0+8];
 ; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0];
 ; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-NEXT:    ld.param.u16 %rs7, [reduce_or_i16_nonpow2_param_0+12];
+; CHECK-NEXT:    ld.param.b16 %rs7, [reduce_or_i16_nonpow2_param_0+12];
 ; CHECK-NEXT:    mov.b16 %rs8, 0;
 ; CHECK-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-NEXT:    or.b32 %r5, %r3, %r4;
@@ -1810,8 +1810,8 @@ define i32 @reduce_or_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0];
 ; CHECK-NEXT:    or.b32 %r9, %r3, %r7;
 ; CHECK-NEXT:    or.b32 %r10, %r1, %r5;
 ; CHECK-NEXT:    or.b32 %r11, %r4, %r8;
@@ -1831,9 +1831,9 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_or_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_or_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0];
 ; CHECK-NEXT:    or.b32 %r8, %r3, %r7;
 ; CHECK-NEXT:    or.b32 %r9, %r1, %r5;
 ; CHECK-NEXT:    or.b32 %r10, %r9, %r8;
@@ -1853,7 +1853,7 @@ define i16 @reduce_xor_i16(<8 x i16> %in) {
 ; CHECK-SM80-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
+; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
 ; CHECK-SM80-NEXT:    xor.b32 %r5, %r2, %r4;
 ; CHECK-SM80-NEXT:    xor.b32 %r6, %r1, %r3;
 ; CHECK-SM80-NEXT:    xor.b32 %r7, %r6, %r5;
@@ -1872,7 +1872,7 @@ define i16 @reduce_xor_i16(<8 x i16> %in) {
 ; CHECK-SM100-NEXT:    .reg .b32 %r<11>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT:  // %bb.0:
-; CHECK-SM100-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
+; CHECK-SM100-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
 ; CHECK-SM100-NEXT:    xor.b32 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT:    xor.b32 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT:    xor.b32 %r7, %r6, %r5;
@@ -1895,12 +1895,12 @@ define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [reduce_xor_i16_nonpow2_param_0+8];
+; CHECK-NEXT:    ld.param.b32 %r1, [reduce_xor_i16_nonpow2_param_0+8];
 ; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NEXT:    ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0];
 ; CHECK-NEXT:    mov.b32 %r2, {%rs1, %rs2};
 ; CHECK-NEXT:    mov.b32 %r3, {%rs3, %rs4};
-; CHECK-NEXT:    ld.param.u16 %rs7, [reduce_xor_i16_nonpow2_param_0+12];
+; CHECK-NEXT:    ld.param.b16 %rs7, [reduce_xor_i16_nonpow2_param_0+12];
 ; CHECK-NEXT:    mov.b16 %rs8, 0;
 ; CHECK-NEXT:    mov.b32 %r4, {%rs7, %rs8};
 ; CHECK-NEXT:    xor.b32 %r5, %r3, %r4;
@@ -1921,8 +1921,8 @@ define i32 @reduce_xor_i32(<8 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<16>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0];
 ; CHECK-NEXT:    xor.b32 %r9, %r3, %r7;
 ; CHECK-NEXT:    xor.b32 %r10, %r1, %r5;
 ; CHECK-NEXT:    xor.b32 %r11, %r4, %r8;
@@ -1942,9 +1942,9 @@ define i32 @reduce_xor_i32_nonpow2(<7 x i32> %in) {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r7, [reduce_xor_i32_nonpow2_param_0+24];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r7, [reduce_xor_i32_nonpow2_param_0+24];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0];
 ; CHECK-NEXT:    xor.b32 %r8, %r3, %r7;
 ; CHECK-NEXT:    xor.b32 %r9, %r1, %r5;
 ; CHECK-NEXT:    xor.b32 %r10, %r9, %r8;
diff --git a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll
index ed785298f5900..dbc10757dc43b 100644
--- a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll
+++ b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll
@@ -10,10 +10,10 @@ define float @redux_sync_fmin(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmin_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmin_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_param_1];
 ; CHECK-NEXT:    redux.sync.min.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin(float %src, i32 %mask)
   ret float %val
@@ -27,10 +27,10 @@ define float @redux_sync_fmin_abs(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmin_abs_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmin_abs_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_abs_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_abs_param_1];
 ; CHECK-NEXT:    redux.sync.min.abs.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin.abs(float %src, i32 %mask)
   ret float %val
@@ -44,10 +44,10 @@ define float @redux_sync_fmin_NaN(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmin_NaN_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmin_NaN_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_NaN_param_1];
 ; CHECK-NEXT:    redux.sync.min.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin.NaN(float %src, i32 %mask)
   ret float %val
@@ -61,10 +61,10 @@ define float @redux_sync_fmin_abs_NaN(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmin_abs_NaN_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmin_abs_NaN_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmin_abs_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmin_abs_NaN_param_1];
 ; CHECK-NEXT:    redux.sync.min.abs.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmin.abs.NaN(float %src, i32 %mask)
   ret float %val
@@ -78,10 +78,10 @@ define float @redux_sync_fmax(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmax_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmax_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_param_1];
 ; CHECK-NEXT:    redux.sync.max.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax(float %src, i32 %mask)
   ret float %val
@@ -95,10 +95,10 @@ define float @redux_sync_fmax_abs(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmax_abs_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmax_abs_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_abs_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_abs_param_1];
 ; CHECK-NEXT:    redux.sync.max.abs.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax.abs(float %src, i32 %mask)
   ret float %val
@@ -112,10 +112,10 @@ define float @redux_sync_fmax_NaN(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmax_NaN_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmax_NaN_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_NaN_param_1];
 ; CHECK-NEXT:    redux.sync.max.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax.NaN(float %src, i32 %mask)
   ret float %val
@@ -129,10 +129,10 @@ define float @redux_sync_fmax_abs_NaN(float %src, i32 %mask) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [redux_sync_fmax_abs_NaN_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [redux_sync_fmax_abs_NaN_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [redux_sync_fmax_abs_NaN_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [redux_sync_fmax_abs_NaN_param_1];
 ; CHECK-NEXT:    redux.sync.max.abs.NaN.f32 %f2, %f1, %r1;
-; CHECK-NEXT:    st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %f2;
 ; CHECK-NEXT:    ret;
   %val = call float @llvm.nvvm.redux.sync.fmax.abs.NaN(float %src, i32 %mask)
   ret float %val
diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll
index 7b4ebcae8a67c..4b50bca7f8efe 100644
--- a/llvm/test/CodeGen/NVPTX/reg-types.ll
+++ b/llvm/test/CodeGen/NVPTX/reg-types.ll
@@ -32,28 +32,28 @@ entry:
 ; Verify that we use correct register types.
   store i8 1, ptr %s8, align 1
 ; CHECK: mov.b16 [[R1:%rs[0-9]]], 1;
-; CHECK-NEXT: st.u8 {{.*}}, [[R1]]
+; CHECK-NEXT: st.b8 {{.*}}, [[R1]]
   store i8 2, ptr %u8, align 1
 ; CHECK: mov.b16 [[R2:%rs[0-9]]], 2;
-; CHECK-NEXT: st.u8 {{.*}}, [[R2]]
+; CHECK-NEXT: st.b8 {{.*}}, [[R2]]
   store i16 3, ptr %s16, align 2
 ; CHECK: mov.b16 [[R3:%rs[0-9]]], 3;
-; CHECK-NEXT: st.u16 {{.*}}, [[R3]]
+; CHECK-NEXT: st.b16 {{.*}}, [[R3]]
   store i16 4, ptr %u16, align 2
 ; CHECK: mov.b16 [[R4:%rs[0-9]]], 4;
-; CHECK-NEXT: st.u16 {{.*}}, [[R4]]
+; CHECK-NEXT: st.b16 {{.*}}, [[R4]]
   store i32 5, ptr %s32, align 4
 ; CHECK: mov.b32 [[R5:%r[0-9]]], 5;
-; CHECK-NEXT: st.u32 {{.*}}, [[R5]]
+; CHECK-NEXT: st.b32 {{.*}}, [[R5]]
   store i32 6, ptr %u32, align 4
 ; CHECK: mov.b32 [[R6:%r[0-9]]], 6;
-; CHECK-NEXT: st.u32 {{.*}}, [[R6]]
+; CHECK-NEXT: st.b32 {{.*}}, [[R6]]
   store i64 7, ptr %s64, align 8
 ; CHECK: mov.b64 [[R7:%rd[0-9]]], 7;
-; CHECK-NEXT: st.u64 {{.*}}, [[R7]]
+; CHECK-NEXT: st.b64 {{.*}}, [[R7]]
   store i64 8, ptr %u64, align 8
 ; CHECK: mov.b64 [[R8:%rd[0-9]]], 8;
-; CHECK-NEXT: st.u64 {{.*}}, [[R8]]
+; CHECK-NEXT: st.b64 {{.*}}, [[R8]]
 
 ; FP constants are stored via integer registers, but that's an
 ; implementation detail that's irrelevant here.
@@ -61,9 +61,9 @@ entry:
   store double 1.000000e+01, ptr %f64, align 8
 ; Instead, we force a load into a register and then verify register type.
   %f32v = load volatile float, ptr %f32, align 4
-; CHECK: ld.volatile.f32         %f{{[0-9]+}}
+; CHECK: ld.volatile.b32         %f{{[0-9]+}}
   %f64v = load volatile double, ptr %f64, align 8
-; CHECK: ld.volatile.f64         %fd{{[0-9]+}}
+; CHECK: ld.volatile.b64         %fd{{[0-9]+}}
   ret void
 ; CHECK: ret;
 ; NO8BIT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/rotate-add.ll b/llvm/test/CodeGen/NVPTX/rotate-add.ll
index 820e8000a5657..aada7eadce2aa 100644
--- a/llvm/test/CodeGen/NVPTX/rotate-add.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate-add.ll
@@ -10,7 +10,7 @@ define i32 @test_simple_rotl(i32 %x) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_simple_rotl_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_simple_rotl_param_0];
 ; CHECK-NEXT:    shf.l.wrap.b32 %r2, %r1, %r1, 7;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -26,7 +26,7 @@ define i32 @test_simple_rotr(i32 %x) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_simple_rotr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_simple_rotr_param_0];
 ; CHECK-NEXT:    shf.l.wrap.b32 %r2, %r1, %r1, 25;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
@@ -42,8 +42,8 @@ define i32 @test_rotl_var(i32 %x, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_rotl_var_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_rotl_var_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rotl_var_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_rotl_var_param_1];
 ; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r1, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -60,8 +60,8 @@ define i32 @test_rotr_var(i32 %x, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_rotr_var_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_rotr_var_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rotr_var_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_rotr_var_param_1];
 ; CHECK-NEXT:    shf.r.wrap.b32 %r3, %r1, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
@@ -78,8 +78,8 @@ define i32 @test_invalid_rotl_var_and(i32 %x, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_invalid_rotl_var_and_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_invalid_rotl_var_and_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_invalid_rotl_var_and_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_invalid_rotl_var_and_param_1];
 ; CHECK-NEXT:    shl.b32 %r3, %r1, %r2;
 ; CHECK-NEXT:    neg.s32 %r4, %r2;
 ; CHECK-NEXT:    and.b32 %r5, %r4, 31;
@@ -101,8 +101,8 @@ define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_invalid_rotr_var_and_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_invalid_rotr_var_and_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_invalid_rotr_var_and_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_invalid_rotr_var_and_param_1];
 ; CHECK-NEXT:    shr.u32 %r3, %r1, %r2;
 ; CHECK-NEXT:    neg.s32 %r4, %r2;
 ; CHECK-NEXT:    and.b32 %r5, %r4, 31;
@@ -124,9 +124,9 @@ define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fshl_special_case_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_fshl_special_case_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_fshl_special_case_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fshl_special_case_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_fshl_special_case_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_fshl_special_case_param_2];
 ; CHECK-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -144,9 +144,9 @@ define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fshr_special_case_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_fshr_special_case_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_fshr_special_case_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fshr_special_case_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_fshr_special_case_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_fshr_special_case_param_2];
 ; CHECK-NEXT:    shf.r.wrap.b32 %r4, %r2, %r1, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -165,7 +165,7 @@ define i64 @test_rotl_udiv_special_case(i64 %i) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_rotl_udiv_special_case_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rotl_udiv_special_case_param_0];
 ; CHECK-NEXT:    mul.hi.u64 %rd2, %rd1, -6148914691236517205;
 ; CHECK-NEXT:    shr.u64 %rd3, %rd2, 1;
 ; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd3;
@@ -187,7 +187,7 @@ define i32 @test_rotl_mul_special_case(i32 %i) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_rotl_mul_special_case_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rotl_mul_special_case_param_0];
 ; CHECK-NEXT:    mul.lo.s32 %r2, %r1, 9;
 ; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r2, %r2, 7;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
@@ -206,7 +206,7 @@ define i64 @test_rotl_mul_with_mask_special_case(i64 %i) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_rotl_mul_with_mask_special_case_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rotl_mul_with_mask_special_case_param_0];
 ; CHECK-NEXT:    mul.lo.s64 %rd2, %rd1, 9;
 ; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
@@ -230,7 +230,7 @@ define i32 @test_fshl_with_mask_special_case(i32 %x) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_fshl_with_mask_special_case_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fshl_with_mask_special_case_param_0];
 ; CHECK-NEXT:    or.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r1, %r2, 5;
 ; CHECK-NEXT:    and.b32 %r4, %r3, -31;
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index f77fb4115567b..2d7fa40e5be7b 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -21,8 +21,8 @@ define i32 @rotate32(i32 %a, i32 %b) {
 ; SM20-NEXT:    .reg .b32 %r<9>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u32 %r1, [rotate32_param_0];
-; SM20-NEXT:    ld.param.u32 %r2, [rotate32_param_1];
+; SM20-NEXT:    ld.param.b32 %r1, [rotate32_param_0];
+; SM20-NEXT:    ld.param.b32 %r2, [rotate32_param_1];
 ; SM20-NEXT:    and.b32 %r3, %r2, 31;
 ; SM20-NEXT:    shl.b32 %r4, %r1, %r3;
 ; SM20-NEXT:    neg.s32 %r5, %r2;
@@ -37,8 +37,8 @@ define i32 @rotate32(i32 %a, i32 %b) {
 ; SM35-NEXT:    .reg .b32 %r<4>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u32 %r1, [rotate32_param_0];
-; SM35-NEXT:    ld.param.u32 %r2, [rotate32_param_1];
+; SM35-NEXT:    ld.param.b32 %r1, [rotate32_param_0];
+; SM35-NEXT:    ld.param.b32 %r2, [rotate32_param_1];
 ; SM35-NEXT:    shf.l.wrap.b32 %r3, %r1, %r1, %r2;
 ; SM35-NEXT:    st.param.b32 [func_retval0], %r3;
 ; SM35-NEXT:    ret;
@@ -53,8 +53,8 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
-; SM20-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotate64_param_0];
+; SM20-NEXT:    ld.param.b32 %r1, [rotate64_param_1];
 ; SM20-NEXT:    and.b32 %r2, %r1, 63;
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, %r2;
 ; SM20-NEXT:    neg.s32 %r3, %r1;
@@ -70,8 +70,8 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
-; SM35-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotate64_param_0];
+; SM35-NEXT:    ld.param.b32 %r1, [rotate64_param_1];
 ; SM35-NEXT:    and.b32 %r2, %r1, 63;
 ; SM35-NEXT:    shl.b64 %rd2, %rd1, %r2;
 ; SM35-NEXT:    neg.s32 %r3, %r1;
@@ -91,8 +91,8 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotateright64_param_0];
-; SM20-NEXT:    ld.param.u32 %r1, [rotateright64_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotateright64_param_0];
+; SM20-NEXT:    ld.param.b32 %r1, [rotateright64_param_1];
 ; SM20-NEXT:    and.b32 %r2, %r1, 63;
 ; SM20-NEXT:    shr.u64 %rd2, %rd1, %r2;
 ; SM20-NEXT:    neg.s32 %r3, %r1;
@@ -108,8 +108,8 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotateright64_param_0];
-; SM35-NEXT:    ld.param.u32 %r1, [rotateright64_param_1];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotateright64_param_0];
+; SM35-NEXT:    ld.param.b32 %r1, [rotateright64_param_1];
 ; SM35-NEXT:    and.b32 %r2, %r1, 63;
 ; SM35-NEXT:    shr.u64 %rd2, %rd1, %r2;
 ; SM35-NEXT:    neg.s32 %r3, %r1;
@@ -128,7 +128,7 @@ define i32 @rotl0(i32 %x) {
 ; SM20-NEXT:    .reg .b32 %r<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u32 %r1, [rotl0_param_0];
+; SM20-NEXT:    ld.param.b32 %r1, [rotl0_param_0];
 ; SM20-NEXT:    shr.u32 %r2, %r1, 24;
 ; SM20-NEXT:    shl.b32 %r3, %r1, 8;
 ; SM20-NEXT:    or.b32 %r4, %r3, %r2;
@@ -140,7 +140,7 @@ define i32 @rotl0(i32 %x) {
 ; SM35-NEXT:    .reg .b32 %r<3>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u32 %r1, [rotl0_param_0];
+; SM35-NEXT:    ld.param.b32 %r1, [rotl0_param_0];
 ; SM35-NEXT:    shf.l.wrap.b32 %r2, %r1, %r1, 8;
 ; SM35-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM35-NEXT:    ret;
@@ -157,8 +157,8 @@ define i64 @rotl64(i64 %a, i64 %n) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_param_0];
-; SM20-NEXT:    ld.param.u32 %r1, [rotl64_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotl64_param_0];
+; SM20-NEXT:    ld.param.b32 %r1, [rotl64_param_1];
 ; SM20-NEXT:    and.b32 %r2, %r1, 63;
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, %r2;
 ; SM20-NEXT:    neg.s32 %r3, %r1;
@@ -174,8 +174,8 @@ define i64 @rotl64(i64 %a, i64 %n) {
 ; SM35-NEXT:    .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_param_0];
-; SM35-NEXT:    ld.param.u32 %r1, [rotl64_param_1];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotl64_param_0];
+; SM35-NEXT:    ld.param.b32 %r1, [rotl64_param_1];
 ; SM35-NEXT:    and.b32 %r2, %r1, 63;
 ; SM35-NEXT:    shl.b64 %rd2, %rd1, %r2;
 ; SM35-NEXT:    neg.s32 %r3, %r1;
@@ -194,7 +194,7 @@ define i64 @rotl64_low_imm(i64 %a) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_low_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotl64_low_imm_param_0];
 ; SM20-NEXT:    shr.u64 %rd2, %rd1, 62;
 ; SM20-NEXT:    shl.b64 %rd3, %rd1, 2;
 ; SM20-NEXT:    or.b64 %rd4, %rd3, %rd2;
@@ -207,7 +207,7 @@ define i64 @rotl64_low_imm(i64 %a) {
 ; SM35-NEXT:    .reg .b64 %rd<3>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_low_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotl64_low_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; SM35-NEXT:    shf.l.wrap.b32 %r3, %r1, %r2, 2;
 ; SM35-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, 2;
@@ -224,7 +224,7 @@ define i64 @rotl64_high_imm(i64 %a) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_high_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotl64_high_imm_param_0];
 ; SM20-NEXT:    shr.u64 %rd2, %rd1, 1;
 ; SM20-NEXT:    shl.b64 %rd3, %rd1, 63;
 ; SM20-NEXT:    or.b64 %rd4, %rd3, %rd2;
@@ -237,7 +237,7 @@ define i64 @rotl64_high_imm(i64 %a) {
 ; SM35-NEXT:    .reg .b64 %rd<3>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_high_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotl64_high_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; SM35-NEXT:    shf.l.wrap.b32 %r3, %r2, %r1, 31;
 ; SM35-NEXT:    shf.l.wrap.b32 %r4, %r1, %r2, 31;
@@ -254,7 +254,7 @@ define i64 @rotl64_32_imm(i64 %a) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_32_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotl64_32_imm_param_0];
 ; SM20-NEXT:    shr.u64 %rd2, %rd1, 32;
 ; SM20-NEXT:    shl.b64 %rd3, %rd1, 32;
 ; SM20-NEXT:    or.b64 %rd4, %rd3, %rd2;
@@ -267,7 +267,7 @@ define i64 @rotl64_32_imm(i64 %a) {
 ; SM35-NEXT:    .reg .b64 %rd<3>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_32_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotl64_32_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; SM35-NEXT:    mov.b64 %rd2, {%r2, %r1};
 ; SM35-NEXT:    st.param.b64 [func_retval0], %rd2;
@@ -283,8 +283,8 @@ define i64 @rotr64(i64 %a, i64 %n) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_param_0];
-; SM20-NEXT:    ld.param.u32 %r1, [rotr64_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotr64_param_0];
+; SM20-NEXT:    ld.param.b32 %r1, [rotr64_param_1];
 ; SM20-NEXT:    and.b32 %r2, %r1, 63;
 ; SM20-NEXT:    shr.u64 %rd2, %rd1, %r2;
 ; SM20-NEXT:    neg.s32 %r3, %r1;
@@ -300,8 +300,8 @@ define i64 @rotr64(i64 %a, i64 %n) {
 ; SM35-NEXT:    .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_param_0];
-; SM35-NEXT:    ld.param.u32 %r1, [rotr64_param_1];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotr64_param_0];
+; SM35-NEXT:    ld.param.b32 %r1, [rotr64_param_1];
 ; SM35-NEXT:    and.b32 %r2, %r1, 63;
 ; SM35-NEXT:    shr.u64 %rd2, %rd1, %r2;
 ; SM35-NEXT:    neg.s32 %r3, %r1;
@@ -320,7 +320,7 @@ define i64 @rotr64_low_imm(i64 %a) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_low_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotr64_low_imm_param_0];
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, 52;
 ; SM20-NEXT:    shr.u64 %rd3, %rd1, 12;
 ; SM20-NEXT:    or.b64 %rd4, %rd3, %rd2;
@@ -333,7 +333,7 @@ define i64 @rotr64_low_imm(i64 %a) {
 ; SM35-NEXT:    .reg .b64 %rd<3>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_low_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotr64_low_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; SM35-NEXT:    shf.r.wrap.b32 %r3, %r2, %r1, 12;
 ; SM35-NEXT:    shf.r.wrap.b32 %r4, %r1, %r2, 12;
@@ -350,7 +350,7 @@ define i64 @rotr64_high_imm(i64 %a) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_high_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotr64_high_imm_param_0];
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, 21;
 ; SM20-NEXT:    shr.u64 %rd3, %rd1, 43;
 ; SM20-NEXT:    or.b64 %rd4, %rd3, %rd2;
@@ -363,7 +363,7 @@ define i64 @rotr64_high_imm(i64 %a) {
 ; SM35-NEXT:    .reg .b64 %rd<3>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_high_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotr64_high_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; SM35-NEXT:    shf.r.wrap.b32 %r3, %r1, %r2, 11;
 ; SM35-NEXT:    shf.r.wrap.b32 %r4, %r2, %r1, 11;
@@ -380,7 +380,7 @@ define i64 @rotr64_32_imm(i64 %a) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_32_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [rotr64_32_imm_param_0];
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, 32;
 ; SM20-NEXT:    shr.u64 %rd3, %rd1, 32;
 ; SM20-NEXT:    or.b64 %rd4, %rd3, %rd2;
@@ -393,7 +393,7 @@ define i64 @rotr64_32_imm(i64 %a) {
 ; SM35-NEXT:    .reg .b64 %rd<3>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_32_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [rotr64_32_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; SM35-NEXT:    mov.b64 %rd2, {%r2, %r1};
 ; SM35-NEXT:    st.param.b64 [func_retval0], %rd2;
@@ -408,10 +408,10 @@ define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) {
 ; SM20-NEXT:    .reg .b32 %r<11>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u32 %r1, [funnel_shift_right_32_param_0];
-; SM20-NEXT:    ld.param.u32 %r2, [funnel_shift_right_32_param_2];
+; SM20-NEXT:    ld.param.b32 %r1, [funnel_shift_right_32_param_0];
+; SM20-NEXT:    ld.param.b32 %r2, [funnel_shift_right_32_param_2];
 ; SM20-NEXT:    and.b32 %r3, %r2, 31;
-; SM20-NEXT:    ld.param.u32 %r4, [funnel_shift_right_32_param_1];
+; SM20-NEXT:    ld.param.b32 %r4, [funnel_shift_right_32_param_1];
 ; SM20-NEXT:    shr.u32 %r5, %r4, %r3;
 ; SM20-NEXT:    shl.b32 %r6, %r1, 1;
 ; SM20-NEXT:    not.b32 %r7, %r2;
@@ -426,9 +426,9 @@ define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) {
 ; SM35-NEXT:    .reg .b32 %r<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u32 %r1, [funnel_shift_right_32_param_0];
-; SM35-NEXT:    ld.param.u32 %r2, [funnel_shift_right_32_param_1];
-; SM35-NEXT:    ld.param.u32 %r3, [funnel_shift_right_32_param_2];
+; SM35-NEXT:    ld.param.b32 %r1, [funnel_shift_right_32_param_0];
+; SM35-NEXT:    ld.param.b32 %r2, [funnel_shift_right_32_param_1];
+; SM35-NEXT:    ld.param.b32 %r3, [funnel_shift_right_32_param_2];
 ; SM35-NEXT:    shf.r.wrap.b32 %r4, %r2, %r1, %r3;
 ; SM35-NEXT:    st.param.b32 [func_retval0], %r4;
 ; SM35-NEXT:    ret;
@@ -442,11 +442,11 @@ define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) {
 ; SM20-NEXT:    .reg .b32 %r<11>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u32 %r1, [funnel_shift_left_32_param_0];
-; SM20-NEXT:    ld.param.u32 %r2, [funnel_shift_left_32_param_2];
+; SM20-NEXT:    ld.param.b32 %r1, [funnel_shift_left_32_param_0];
+; SM20-NEXT:    ld.param.b32 %r2, [funnel_shift_left_32_param_2];
 ; SM20-NEXT:    and.b32 %r3, %r2, 31;
 ; SM20-NEXT:    shl.b32 %r4, %r1, %r3;
-; SM20-NEXT:    ld.param.u32 %r5, [funnel_shift_left_32_param_1];
+; SM20-NEXT:    ld.param.b32 %r5, [funnel_shift_left_32_param_1];
 ; SM20-NEXT:    shr.u32 %r6, %r5, 1;
 ; SM20-NEXT:    not.b32 %r7, %r2;
 ; SM20-NEXT:    and.b32 %r8, %r7, 31;
@@ -460,9 +460,9 @@ define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) {
 ; SM35-NEXT:    .reg .b32 %r<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u32 %r1, [funnel_shift_left_32_param_0];
-; SM35-NEXT:    ld.param.u32 %r2, [funnel_shift_left_32_param_1];
-; SM35-NEXT:    ld.param.u32 %r3, [funnel_shift_left_32_param_2];
+; SM35-NEXT:    ld.param.b32 %r1, [funnel_shift_left_32_param_0];
+; SM35-NEXT:    ld.param.b32 %r2, [funnel_shift_left_32_param_1];
+; SM35-NEXT:    ld.param.b32 %r3, [funnel_shift_left_32_param_2];
 ; SM35-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, %r3;
 ; SM35-NEXT:    st.param.b32 [func_retval0], %r4;
 ; SM35-NEXT:    ret;
@@ -477,10 +477,10 @@ define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) {
 ; SM20-NEXT:    .reg .b64 %rd<7>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [funnel_shift_right_64_param_0];
-; SM20-NEXT:    ld.param.u32 %r1, [funnel_shift_right_64_param_2];
+; SM20-NEXT:    ld.param.b64 %rd1, [funnel_shift_right_64_param_0];
+; SM20-NEXT:    ld.param.b32 %r1, [funnel_shift_right_64_param_2];
 ; SM20-NEXT:    and.b32 %r2, %r1, 63;
-; SM20-NEXT:    ld.param.u64 %rd2, [funnel_shift_right_64_param_1];
+; SM20-NEXT:    ld.param.b64 %rd2, [funnel_shift_right_64_param_1];
 ; SM20-NEXT:    shr.u64 %rd3, %rd2, %r2;
 ; SM20-NEXT:    shl.b64 %rd4, %rd1, 1;
 ; SM20-NEXT:    not.b32 %r3, %r1;
@@ -496,10 +496,10 @@ define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) {
 ; SM35-NEXT:    .reg .b64 %rd<7>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [funnel_shift_right_64_param_0];
-; SM35-NEXT:    ld.param.u32 %r1, [funnel_shift_right_64_param_2];
+; SM35-NEXT:    ld.param.b64 %rd1, [funnel_shift_right_64_param_0];
+; SM35-NEXT:    ld.param.b32 %r1, [funnel_shift_right_64_param_2];
 ; SM35-NEXT:    and.b32 %r2, %r1, 63;
-; SM35-NEXT:    ld.param.u64 %rd2, [funnel_shift_right_64_param_1];
+; SM35-NEXT:    ld.param.b64 %rd2, [funnel_shift_right_64_param_1];
 ; SM35-NEXT:    shr.u64 %rd3, %rd2, %r2;
 ; SM35-NEXT:    shl.b64 %rd4, %rd1, 1;
 ; SM35-NEXT:    not.b32 %r3, %r1;
@@ -519,11 +519,11 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
 ; SM20-NEXT:    .reg .b64 %rd<7>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [funnel_shift_left_64_param_0];
-; SM20-NEXT:    ld.param.u32 %r1, [funnel_shift_left_64_param_2];
+; SM20-NEXT:    ld.param.b64 %rd1, [funnel_shift_left_64_param_0];
+; SM20-NEXT:    ld.param.b32 %r1, [funnel_shift_left_64_param_2];
 ; SM20-NEXT:    and.b32 %r2, %r1, 63;
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, %r2;
-; SM20-NEXT:    ld.param.u64 %rd3, [funnel_shift_left_64_param_1];
+; SM20-NEXT:    ld.param.b64 %rd3, [funnel_shift_left_64_param_1];
 ; SM20-NEXT:    shr.u64 %rd4, %rd3, 1;
 ; SM20-NEXT:    not.b32 %r3, %r1;
 ; SM20-NEXT:    and.b32 %r4, %r3, 63;
@@ -538,11 +538,11 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
 ; SM35-NEXT:    .reg .b64 %rd<7>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [funnel_shift_left_64_param_0];
-; SM35-NEXT:    ld.param.u32 %r1, [funnel_shift_left_64_param_2];
+; SM35-NEXT:    ld.param.b64 %rd1, [funnel_shift_left_64_param_0];
+; SM35-NEXT:    ld.param.b32 %r1, [funnel_shift_left_64_param_2];
 ; SM35-NEXT:    and.b32 %r2, %r1, 63;
 ; SM35-NEXT:    shl.b64 %rd2, %rd1, %r2;
-; SM35-NEXT:    ld.param.u64 %rd3, [funnel_shift_left_64_param_1];
+; SM35-NEXT:    ld.param.b64 %rd3, [funnel_shift_left_64_param_1];
 ; SM35-NEXT:    shr.u64 %rd4, %rd3, 1;
 ; SM35-NEXT:    not.b32 %r3, %r1;
 ; SM35-NEXT:    and.b32 %r4, %r3, 63;
@@ -560,8 +560,8 @@ define i64 @fshl64_low_imm(i64 %a, i64 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<6>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [fshl64_low_imm_param_0];
-; SM20-NEXT:    ld.param.u64 %rd2, [fshl64_low_imm_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [fshl64_low_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd2, [fshl64_low_imm_param_1];
 ; SM20-NEXT:    shr.u64 %rd3, %rd2, 59;
 ; SM20-NEXT:    shl.b64 %rd4, %rd1, 5;
 ; SM20-NEXT:    or.b64 %rd5, %rd4, %rd3;
@@ -574,9 +574,9 @@ define i64 @fshl64_low_imm(i64 %a, i64 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<4>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [fshl64_low_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [fshl64_low_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
-; SM35-NEXT:    ld.param.u64 %rd2, [fshl64_low_imm_param_1];
+; SM35-NEXT:    ld.param.b64 %rd2, [fshl64_low_imm_param_1];
 ; SM35-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; SM35-NEXT:    shf.l.wrap.b32 %r5, %r4, %r1, 5;
 ; SM35-NEXT:    shf.l.wrap.b32 %r6, %r1, %r2, 5;
@@ -593,8 +593,8 @@ define i64 @fshl64_high_imm(i64 %a, i64 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<6>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [fshl64_high_imm_param_0];
-; SM20-NEXT:    ld.param.u64 %rd2, [fshl64_high_imm_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [fshl64_high_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd2, [fshl64_high_imm_param_1];
 ; SM20-NEXT:    shr.u64 %rd3, %rd2, 9;
 ; SM20-NEXT:    shl.b64 %rd4, %rd1, 55;
 ; SM20-NEXT:    or.b64 %rd5, %rd4, %rd3;
@@ -607,9 +607,9 @@ define i64 @fshl64_high_imm(i64 %a, i64 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<4>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [fshl64_high_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [fshl64_high_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
-; SM35-NEXT:    ld.param.u64 %rd2, [fshl64_high_imm_param_1];
+; SM35-NEXT:    ld.param.b64 %rd2, [fshl64_high_imm_param_1];
 ; SM35-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; SM35-NEXT:    shf.l.wrap.b32 %r5, %r4, %r1, 23;
 ; SM35-NEXT:    shf.l.wrap.b32 %r6, %r3, %r4, 23;
@@ -626,9 +626,9 @@ define i64 @fshl64_32_imm(i64 %a, i64 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [fshl64_32_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [fshl64_32_imm_param_0];
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, 32;
-; SM20-NEXT:    ld.param.u32 %rd3, [fshl64_32_imm_param_1+4];
+; SM20-NEXT:    ld.param.b32 %rd3, [fshl64_32_imm_param_1+4];
 ; SM20-NEXT:    or.b64 %rd4, %rd2, %rd3;
 ; SM20-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT:    ret;
@@ -639,9 +639,9 @@ define i64 @fshl64_32_imm(i64 %a, i64 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<4>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [fshl64_32_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [fshl64_32_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
-; SM35-NEXT:    ld.param.u64 %rd2, [fshl64_32_imm_param_1];
+; SM35-NEXT:    ld.param.b64 %rd2, [fshl64_32_imm_param_1];
 ; SM35-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; SM35-NEXT:    mov.b64 %rd3, {%r4, %r1};
 ; SM35-NEXT:    st.param.b64 [func_retval0], %rd3;
@@ -656,8 +656,8 @@ define i64 @fshr64_low_imm(i64 %a, i64 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<6>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [fshr64_low_imm_param_0];
-; SM20-NEXT:    ld.param.u64 %rd2, [fshr64_low_imm_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [fshr64_low_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd2, [fshr64_low_imm_param_1];
 ; SM20-NEXT:    shr.u64 %rd3, %rd2, 31;
 ; SM20-NEXT:    shl.b64 %rd4, %rd1, 33;
 ; SM20-NEXT:    or.b64 %rd5, %rd4, %rd3;
@@ -670,9 +670,9 @@ define i64 @fshr64_low_imm(i64 %a, i64 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<4>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [fshr64_low_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [fshr64_low_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
-; SM35-NEXT:    ld.param.u64 %rd2, [fshr64_low_imm_param_1];
+; SM35-NEXT:    ld.param.b64 %rd2, [fshr64_low_imm_param_1];
 ; SM35-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; SM35-NEXT:    shf.r.wrap.b32 %r5, %r4, %r1, 31;
 ; SM35-NEXT:    shf.r.wrap.b32 %r6, %r3, %r4, 31;
@@ -689,8 +689,8 @@ define i64 @fshr64_high_imm(i64 %a, i64 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<6>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [fshr64_high_imm_param_0];
-; SM20-NEXT:    ld.param.u64 %rd2, [fshr64_high_imm_param_1];
+; SM20-NEXT:    ld.param.b64 %rd1, [fshr64_high_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd2, [fshr64_high_imm_param_1];
 ; SM20-NEXT:    shr.u64 %rd3, %rd2, 33;
 ; SM20-NEXT:    shl.b64 %rd4, %rd1, 31;
 ; SM20-NEXT:    or.b64 %rd5, %rd4, %rd3;
@@ -703,9 +703,9 @@ define i64 @fshr64_high_imm(i64 %a, i64 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<4>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [fshr64_high_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [fshr64_high_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
-; SM35-NEXT:    ld.param.u64 %rd2, [fshr64_high_imm_param_1];
+; SM35-NEXT:    ld.param.b64 %rd2, [fshr64_high_imm_param_1];
 ; SM35-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; SM35-NEXT:    shf.r.wrap.b32 %r5, %r4, %r1, 1;
 ; SM35-NEXT:    shf.r.wrap.b32 %r6, %r1, %r2, 1;
@@ -722,9 +722,9 @@ define i64 @fshr64_32_imm(i64 %a, i64 %b) {
 ; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
-; SM20-NEXT:    ld.param.u64 %rd1, [fshr64_32_imm_param_0];
+; SM20-NEXT:    ld.param.b64 %rd1, [fshr64_32_imm_param_0];
 ; SM20-NEXT:    shl.b64 %rd2, %rd1, 32;
-; SM20-NEXT:    ld.param.u32 %rd3, [fshr64_32_imm_param_1+4];
+; SM20-NEXT:    ld.param.b32 %rd3, [fshr64_32_imm_param_1+4];
 ; SM20-NEXT:    or.b64 %rd4, %rd2, %rd3;
 ; SM20-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; SM20-NEXT:    ret;
@@ -735,9 +735,9 @@ define i64 @fshr64_32_imm(i64 %a, i64 %b) {
 ; SM35-NEXT:    .reg .b64 %rd<4>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
-; SM35-NEXT:    ld.param.u64 %rd1, [fshr64_32_imm_param_0];
+; SM35-NEXT:    ld.param.b64 %rd1, [fshr64_32_imm_param_0];
 ; SM35-NEXT:    mov.b64 {%r1, %r2}, %rd1;
-; SM35-NEXT:    ld.param.u64 %rd2, [fshr64_32_imm_param_1];
+; SM35-NEXT:    ld.param.b64 %rd2, [fshr64_32_imm_param_1];
 ; SM35-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; SM35-NEXT:    mov.b64 %rd3, {%r4, %r1};
 ; SM35-NEXT:    st.param.b64 [func_retval0], %rd3;
diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll
index 841dc67c68640..c91211a13fdfe 100644
--- a/llvm/test/CodeGen/NVPTX/rotate_64.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll
@@ -12,7 +12,7 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [rotate64_param_0];
 ; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r1, %r2, 3;
 ; CHECK-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, 3;
@@ -30,7 +30,7 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [rotateright64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [rotateright64_param_0];
 ; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    shf.r.wrap.b32 %r3, %r2, %r1, 3;
 ; CHECK-NEXT:    shf.r.wrap.b32 %r4, %r1, %r2, 3;
diff --git a/llvm/test/CodeGen/NVPTX/sad-intrins.ll b/llvm/test/CodeGen/NVPTX/sad-intrins.ll
index 8258dca605e9e..bd80784f62f4e 100644
--- a/llvm/test/CodeGen/NVPTX/sad-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/sad-intrins.ll
@@ -9,9 +9,9 @@ define i16 @test_sad_i16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_sad_i16_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [test_sad_i16_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs3, [test_sad_i16_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_sad_i16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [test_sad_i16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [test_sad_i16_param_2];
 ; CHECK-NEXT:    sad.s16 %rs4, %rs1, %rs2, %rs3;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -27,9 +27,9 @@ define i16 @test_sad_u16(i16 %x, i16 %y, i16 %z) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [test_sad_u16_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [test_sad_u16_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs3, [test_sad_u16_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_sad_u16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [test_sad_u16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [test_sad_u16_param_2];
 ; CHECK-NEXT:    sad.u16 %rs4, %rs1, %rs2, %rs3;
 ; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
@@ -44,9 +44,9 @@ define i32 @test_sad_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_sad_i32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_sad_i32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_sad_i32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sad_i32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_sad_i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_sad_i32_param_2];
 ; CHECK-NEXT:    sad.s32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -60,9 +60,9 @@ define i32 @test_sad_u32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_sad_u32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [test_sad_u32_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [test_sad_u32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sad_u32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_sad_u32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_sad_u32_param_2];
 ; CHECK-NEXT:    sad.u32 %r4, %r1, %r2, %r3;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
@@ -76,9 +76,9 @@ define i64 @test_sad_i64(i64 %x, i64 %y, i64 %z) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_sad_i64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_sad_i64_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd3, [test_sad_i64_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sad_i64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_sad_i64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_sad_i64_param_2];
 ; CHECK-NEXT:    sad.s64 %rd4, %rd1, %rd2, %rd3;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT:    ret;
@@ -92,9 +92,9 @@ define i64 @test_sad_u64(i64 %x, i64 %y, i64 %z) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_sad_u64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_sad_u64_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd3, [test_sad_u64_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sad_u64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_sad_u64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_sad_u64_param_2];
 ; CHECK-NEXT:    sad.u64 %rd4, %rd1, %rd2, %rd3;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/sched1.ll b/llvm/test/CodeGen/NVPTX/sched1.ll
index e7358157ea54b..09bd4243138a8 100644
--- a/llvm/test/CodeGen/NVPTX/sched1.ll
+++ b/llvm/test/CodeGen/NVPTX/sched1.ll
@@ -5,10 +5,10 @@
 
 define void @foo(ptr %a) {
 ; CHECK: .func foo
-; CHECK: ld.u32
-; CHECK-NEXT: ld.u32
-; CHECK-NEXT: ld.u32
-; CHECK-NEXT: ld.u32
+; CHECK: ld.b32
+; CHECK-NEXT: ld.b32
+; CHECK-NEXT: ld.b32
+; CHECK-NEXT: ld.b32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
diff --git a/llvm/test/CodeGen/NVPTX/sched2.ll b/llvm/test/CodeGen/NVPTX/sched2.ll
index 950f9f2540421..5518752315426 100644
--- a/llvm/test/CodeGen/NVPTX/sched2.ll
+++ b/llvm/test/CodeGen/NVPTX/sched2.ll
@@ -3,10 +3,10 @@
 
 define void @foo(ptr %a) {
 ; CHECK: .func foo
-; CHECK: ld.v2.u32
-; CHECK-NEXT: ld.v2.u32
-; CHECK-NEXT: ld.v2.u32
-; CHECK-NEXT: ld.v2.u32
+; CHECK: ld.v2.b32
+; CHECK-NEXT: ld.v2.b32
+; CHECK-NEXT: ld.v2.b32
+; CHECK-NEXT: ld.v2.b32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
 ; CHECK-NEXT: add.s32
diff --git a/llvm/test/CodeGen/NVPTX/sext-params.ll b/llvm/test/CodeGen/NVPTX/sext-params.ll
index 0a502288c473f..a8afcec759fe8 100644
--- a/llvm/test/CodeGen/NVPTX/sext-params.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-params.ll
@@ -11,7 +11,7 @@ define i8 @foo(i8 signext %a) {
 }
 
 define i8 @bar(i8 zeroext %a) {
-; CHECK: ld.param.u8
+; CHECK: ld.param.b8
   %ret = add i8 %a, 3
   ret i8 %ret
 }
diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
index ba5291a6a95da..802954bda6a9f 100644
--- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll
+++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll
@@ -11,8 +11,8 @@ define <2 x i16> @sext_setcc_v2i1_to_v2i16(ptr %p) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd1, [sext_setcc_v2i1_to_v2i16_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [sext_setcc_v2i1_to_v2i16_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    setp.eq.s16 %p1, %rs1, 0;
 ; CHECK-NEXT:    setp.eq.s16 %p2, %rs2, 0;
@@ -37,8 +37,8 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1];
 ; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 255;
diff --git a/llvm/test/CodeGen/NVPTX/shfl-p.ll b/llvm/test/CodeGen/NVPTX/shfl-p.ll
index a631740cf36d6..756998196fdec 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-p.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-p.ll
@@ -12,9 +12,9 @@ declare {float, i1} @llvm.nvvm.shfl.idx.f32p(float, i32, i32)
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_rrr
 define {i32, i1} @shfl_i32_rrr(i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
@@ -23,9 +23,9 @@ define {i32, i1} @shfl_i32_rrr(i32 %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_irr
 define {i32, i1} @shfl_i32_irr(i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
@@ -34,8 +34,8 @@ define {i32, i1} @shfl_i32_irr(i32 %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_rri
 define {i32, i1} @shfl_i32_rri(i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 1)
@@ -44,8 +44,8 @@ define {i32, i1} @shfl_i32_rri(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_iri
 define {i32, i1} @shfl_i32_iri(i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 2)
@@ -54,8 +54,8 @@ define {i32, i1} @shfl_i32_iri(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_rir
 define {i32, i1} @shfl_i32_rir(i32 %a, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 %c)
@@ -64,8 +64,8 @@ define {i32, i1} @shfl_i32_rir(i32 %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_iir
 define {i32, i1} @shfl_i32_iir(i32 %a, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 %c)
@@ -74,7 +74,7 @@ define {i32, i1} @shfl_i32_iir(i32 %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_rii
 define {i32, i1} @shfl_i32_rii(i32 %a) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 2)
@@ -83,7 +83,7 @@ define {i32, i1} @shfl_i32_rii(i32 %a) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_i32_iii
 define {i32, i1} @shfl_i32_iii(i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 3)
@@ -94,9 +94,9 @@ define {i32, i1} @shfl_i32_iii(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_rrr
 define {float, i1} @shfl_f32_rrr(float %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
@@ -105,9 +105,9 @@ define {float, i1} @shfl_f32_rrr(float %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_irr
 define {float, i1} @shfl_f32_irr(float %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
@@ -116,8 +116,8 @@ define {float, i1} @shfl_f32_irr(float %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_rri
 define {float, i1} @shfl_f32_rri(float %a, i32 %b) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 1)
@@ -126,8 +126,8 @@ define {float, i1} @shfl_f32_rri(float %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_iri
 define {float, i1} @shfl_f32_iri(float %a, i32 %b) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 2)
@@ -136,8 +136,8 @@ define {float, i1} @shfl_f32_iri(float %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_rir
 define {float, i1} @shfl_f32_rir(float %a, i32 %c) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 %c)
@@ -146,8 +146,8 @@ define {float, i1} @shfl_f32_rir(float %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_iir
 define {float, i1} @shfl_f32_iir(float %a, i32 %c) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 %c)
@@ -156,7 +156,7 @@ define {float, i1} @shfl_f32_iir(float %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_rii
 define {float, i1} @shfl_f32_rii(float %a) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 2)
@@ -165,7 +165,7 @@ define {float, i1} @shfl_f32_rii(float %a) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_f32_iii
 define {float, i1} @shfl_f32_iii(float %a, i32 %b) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 3)
diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
index 20f5c571e9d2e..74890dc4fed20 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
@@ -12,10 +12,10 @@ declare {float, i1} @llvm.nvvm.shfl.sync.idx.f32p(i32, float, i32, i32)
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rrr
 define {i32, i1} @shfl_sync_i32_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 %c)
@@ -24,9 +24,9 @@ define {i32, i1} @shfl_sync_i32_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_irr
 define {i32, i1} @shfl_sync_i32_irr(i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 %c)
@@ -35,9 +35,9 @@ define {i32, i1} @shfl_sync_i32_irr(i32 %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rri
 define {i32, i1} @shfl_sync_i32_rri(i32 %mask, i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 1)
@@ -46,8 +46,8 @@ define {i32, i1} @shfl_sync_i32_rri(i32 %mask, i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iri
 define {i32, i1} @shfl_sync_i32_iri(i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 2)
@@ -56,9 +56,9 @@ define {i32, i1} @shfl_sync_i32_iri(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rir
 define {i32, i1} @shfl_sync_i32_rir(i32 %mask, i32 %a, i32 %c) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 %c)
@@ -67,8 +67,8 @@ define {i32, i1} @shfl_sync_i32_rir(i32 %mask, i32 %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iir
 define {i32, i1} @shfl_sync_i32_iir(i32 %a, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 %c)
@@ -77,8 +77,8 @@ define {i32, i1} @shfl_sync_i32_iir(i32 %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rii
 define {i32, i1} @shfl_sync_i32_rii(i32 %mask, i32 %a) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 2)
@@ -87,7 +87,7 @@ define {i32, i1} @shfl_sync_i32_rii(i32 %mask, i32 %a) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iii
 define {i32, i1} @shfl_sync_i32_iii(i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 3)
@@ -98,10 +98,10 @@ define {i32, i1} @shfl_sync_i32_iii(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rrr
 define {float, i1} @shfl_sync_f32_rrr(i32 %mask, float %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 %c)
@@ -110,9 +110,9 @@ define {float, i1} @shfl_sync_f32_rrr(i32 %mask, float %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_irr
 define {float, i1} @shfl_sync_f32_irr(float %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 %c)
@@ -121,9 +121,9 @@ define {float, i1} @shfl_sync_f32_irr(float %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rri
 define {float, i1} @shfl_sync_f32_rri(i32 %mask, float %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 1)
@@ -132,8 +132,8 @@ define {float, i1} @shfl_sync_f32_rri(i32 %mask, float %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iri
 define {float, i1} @shfl_sync_f32_iri(float %a, i32 %b) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 2)
@@ -142,9 +142,9 @@ define {float, i1} @shfl_sync_f32_iri(float %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rir
 define {float, i1} @shfl_sync_f32_rir(i32 %mask, float %a, i32 %c) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 %c)
@@ -153,8 +153,8 @@ define {float, i1} @shfl_sync_f32_rir(i32 %mask, float %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iir
 define {float, i1} @shfl_sync_f32_iir(float %a, i32 %c) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 %c)
@@ -163,8 +163,8 @@ define {float, i1} @shfl_sync_f32_iir(float %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rii
 define {float, i1} @shfl_sync_f32_rii(i32 %mask, float %a) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 2)
@@ -173,7 +173,7 @@ define {float, i1} @shfl_sync_f32_rii(i32 %mask, float %a) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iii
 define {float, i1} @shfl_sync_f32_iii(float %a, i32 %b) {
-  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%f[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 3)
diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync.ll b/llvm/test/CodeGen/NVPTX/shfl-sync.ll
index a7e2932e61d37..0c826d221d056 100644
--- a/llvm/test/CodeGen/NVPTX/shfl-sync.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl-sync.ll
@@ -12,10 +12,10 @@ declare float @llvm.nvvm.shfl.sync.idx.f32(float, i32, i32, i32)
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_rrr
 define i32 @shfl_sync_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 %c)
@@ -24,9 +24,9 @@ define i32 @shfl_sync_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_irr
 define i32 @shfl_sync_irr(i32 %a, i32 %b, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 %c)
@@ -35,9 +35,9 @@ define i32 @shfl_sync_irr(i32 %a, i32 %b, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_rri
 define i32 @shfl_sync_rri(i32 %mask, i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 1, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 1)
@@ -46,8 +46,8 @@ define i32 @shfl_sync_rri(i32 %mask, i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_iri
 define i32 @shfl_sync_iri(i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[B:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 2, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 2)
@@ -56,9 +56,9 @@ define i32 @shfl_sync_iri(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_rir
 define i32 @shfl_sync_rir(i32 %mask, i32 %a, i32 %c) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, [[C]], [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 %c)
@@ -67,8 +67,8 @@ define i32 @shfl_sync_rir(i32 %mask, i32 %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_iir
 define i32 @shfl_sync_iir(i32 %a, i32 %c) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[C:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, [[C]], 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 %c)
@@ -77,8 +77,8 @@ define i32 @shfl_sync_iir(i32 %a, i32 %c) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_rii
 define i32 @shfl_sync_rii(i32 %mask, i32 %a) {
-  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, 2, [[MASK]];
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 2)
@@ -87,7 +87,7 @@ define i32 @shfl_sync_rii(i32 %mask, i32 %a) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_sync_iii
 define i32 @shfl_sync_iii(i32 %a, i32 %b) {
-  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[A:%r[0-9]+]]
   ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, 3, 1;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 3)
diff --git a/llvm/test/CodeGen/NVPTX/shfl.ll b/llvm/test/CodeGen/NVPTX/shfl.ll
index fbf4ea4cd1c74..8aedba26b56bc 100644
--- a/llvm/test/CodeGen/NVPTX/shfl.ll
+++ b/llvm/test/CodeGen/NVPTX/shfl.ll
@@ -15,7 +15,7 @@ declare float @llvm.nvvm.shfl.idx.f32(float, i32, i32)
 
 ; CHECK-LABEL: .func{{.*}}shfl_down1
 define i32 @shfl_down1(i32 %in) {
-  ; CHECK: ld.param.u32 [[IN:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN:%r[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]], [[IN]], 1, 2;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 1, i32 2)
@@ -24,8 +24,8 @@ define i32 @shfl_down1(i32 %in) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_down2
 define i32 @shfl_down2(i32 %in, i32 %width) {
-  ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN1:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN2:%r[0-9]+]]
   ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], 3;
   %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 %width, i32 3)
   ret i32 %val
@@ -33,8 +33,8 @@ define i32 @shfl_down2(i32 %in, i32 %width) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_down3
 define i32 @shfl_down3(i32 %in, i32 %mask) {
-  ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN1:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN2:%r[0-9]+]]
   ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], 4, [[IN2]];
   %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 4, i32 %mask)
   ret i32 %val
@@ -42,9 +42,9 @@ define i32 @shfl_down3(i32 %in, i32 %mask) {
 
 ; CHECK-LABEL: .func{{.*}}shfl_down4
 define i32 @shfl_down4(i32 %in, i32 %width, i32 %mask) {
-  ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]
-  ; CHECK: ld.param.u32 [[IN3:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN1:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN2:%r[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN3:%r[0-9]+]]
   ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], [[IN3]];
   %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 %width, i32 %mask)
   ret i32 %val
@@ -53,7 +53,7 @@ define i32 @shfl_down4(i32 %in, i32 %width, i32 %mask) {
 ; Try shfl.down with floating-point params.
 ; CHECK-LABEL: .func{{.*}}shfl_down_float
 define float @shfl_down_float(float %in) {
-  ; CHECK: ld.param.f32 [[IN:%f[0-9]+]]
+  ; CHECK: ld.param.b32 [[IN:%f[0-9]+]]
   ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]], [[IN]], 5, 6;
   ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
   %out = call float @llvm.nvvm.shfl.down.f32(float %in, i32 5, i32 6)
diff --git a/llvm/test/CodeGen/NVPTX/short-ptr.ll b/llvm/test/CodeGen/NVPTX/short-ptr.ll
index 55828fa9ec80f..eb058955e0aa1 100644
--- a/llvm/test/CodeGen/NVPTX/short-ptr.ll
+++ b/llvm/test/CodeGen/NVPTX/short-ptr.ll
@@ -22,9 +22,9 @@ declare void @use(i8 %arg);
 ; CHECK-DEFAULT-32: .param .b32 test1_param_0
 ; CHECK-SHORT-LOCAL: .param .b32 test1_param_0
 define void @test1(ptr addrspace(5) %local) {
-  ; CHECK-DEFAULT: ld.param.u64 %rd{{.*}}, [test1_param_0];
-  ; CHECK-DEFAULT-32:  ld.param.u32 %r{{.*}}, [test1_param_0];
-  ; CHECK-SHORT-LOCAL: ld.param.u32 %r{{.*}}, [test1_param_0];
+  ; CHECK-DEFAULT: ld.param.b64 %rd{{.*}}, [test1_param_0];
+  ; CHECK-DEFAULT-32:  ld.param.b32 %r{{.*}}, [test1_param_0];
+  ; CHECK-SHORT-LOCAL: ld.param.b32 %r{{.*}}, [test1_param_0];
   %v = load i8, ptr addrspace(5) %local
   call void @use(i8 %v)
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
index a6a286e608ced..d79029f124d8a 100644
--- a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
+++ b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
@@ -9,14 +9,14 @@ define void @kernel_func(ptr %in.vec, ptr %out.vec0) nounwind {
 ; CHECK-NEXT:    .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [kernel_func_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [kernel_func_param_0];
 ; CHECK-NEXT:    ld.v4.b32 {%r2, %r3, %r4, %r5}, [%r1];
 ; CHECK-NEXT:    ld.v4.b32 {%r6, %r7, %r8, %r9}, [%r1+16];
-; CHECK-NEXT:    ld.param.u32 %r10, [kernel_func_param_1];
+; CHECK-NEXT:    ld.param.b32 %r10, [kernel_func_param_1];
 ; CHECK-NEXT:    prmt.b32 %r11, %r6, %r8, 0x4000U;
 ; CHECK-NEXT:    prmt.b32 %r12, %r2, %r4, 0x40U;
 ; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 0x7610U;
-; CHECK-NEXT:    st.u32 [%r10], %r13;
+; CHECK-NEXT:    st.b32 [%r10], %r13;
 ; CHECK-NEXT:    ret;
   %wide.vec = load <32 x i8>, ptr %in.vec, align 64
   %vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
index daccaaf57d521..d2b3f2b61ffb5 100644
--- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll
@@ -8,24 +8,24 @@
 ;; i8
 ; ALL-LABEL: st_global_i8
 define void @st_global_i8(ptr addrspace(1) %ptr, i8 %a) {
-; G32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
-; G64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; G32: st.global.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; G64: st.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; ALL: ret
   store i8 %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_i8
 define void @st_shared_i8(ptr addrspace(3) %ptr, i8 %a) {
-; LS32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
-; LS64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; LS32: st.shared.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; LS64: st.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; ALL: ret
   store i8 %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_i8
 define void @st_local_i8(ptr addrspace(5) %ptr, i8 %a) {
-; LS32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
-; LS64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; LS32: st.local.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; LS64: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; ALL: ret
   store i8 %a, ptr addrspace(5) %ptr
   ret void
@@ -34,24 +34,24 @@ define void @st_local_i8(ptr addrspace(5) %ptr, i8 %a) {
 ;; i16
 ; ALL-LABEL: st_global_i16
 define void @st_global_i16(ptr addrspace(1) %ptr, i16 %a) {
-; G32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
-; G64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; G32: st.global.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; G64: st.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; ALL: ret
   store i16 %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_i16
 define void @st_shared_i16(ptr addrspace(3) %ptr, i16 %a) {
-; LS32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
-; LS64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; LS32: st.shared.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; LS64: st.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; ALL: ret
   store i16 %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_i16
 define void @st_local_i16(ptr addrspace(5) %ptr, i16 %a) {
-; LS32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
-; LS64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; LS32: st.local.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; LS64: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; ALL: ret
   store i16 %a, ptr addrspace(5) %ptr
   ret void
@@ -60,24 +60,24 @@ define void @st_local_i16(ptr addrspace(5) %ptr, i16 %a) {
 ;; i32
 ; ALL-LABEL: st_global_i32
 define void @st_global_i32(ptr addrspace(1) %ptr, i32 %a) {
-; G32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
-; G64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+; G32: st.global.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; G64: st.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; ALL: ret
   store i32 %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_i32
 define void @st_shared_i32(ptr addrspace(3) %ptr, i32 %a) {
-; LS32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
-; LS64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+; LS32: st.shared.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; LS64: st.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_i32
 define void @st_local_i32(ptr addrspace(5) %ptr, i32 %a) {
-; LS32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
-; LS64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+; LS32: st.local.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; LS64: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; ALL: ret
   store i32 %a, ptr addrspace(5) %ptr
   ret void
@@ -86,24 +86,24 @@ define void @st_local_i32(ptr addrspace(5) %ptr, i32 %a) {
 ;; i64
 ; ALL-LABEL: st_global_i64
 define void @st_global_i64(ptr addrspace(1) %ptr, i64 %a) {
-; G32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
-; G64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+; G32: st.global.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
+; G64: st.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; ALL: ret
   store i64 %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_i64
 define void @st_shared_i64(ptr addrspace(3) %ptr, i64 %a) {
-; LS32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
-; LS64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+; LS32: st.shared.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
+; LS64: st.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; ALL: ret
   store i64 %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_i64
 define void @st_local_i64(ptr addrspace(5) %ptr, i64 %a) {
-; LS32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
-; LS64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+; LS32: st.local.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
+; LS64: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; ALL: ret
   store i64 %a, ptr addrspace(5) %ptr
   ret void
@@ -112,24 +112,24 @@ define void @st_local_i64(ptr addrspace(5) %ptr, i64 %a) {
 ;; f32
 ; ALL-LABEL: st_global_f32
 define void @st_global_f32(ptr addrspace(1) %ptr, float %a) {
-; G32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
-; G64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; G32: st.global.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; G64: st.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; ALL: ret
   store float %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_f32
 define void @st_shared_f32(ptr addrspace(3) %ptr, float %a) {
-; LS32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
-; LS64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; LS32: st.shared.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; LS64: st.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; ALL: ret
   store float %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_f32
 define void @st_local_f32(ptr addrspace(5) %ptr, float %a) {
-; LS32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
-; LS64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; LS32: st.local.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; LS64: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; ALL: ret
   store float %a, ptr addrspace(5) %ptr
   ret void
@@ -138,24 +138,24 @@ define void @st_local_f32(ptr addrspace(5) %ptr, float %a) {
 ;; f64
 ; ALL-LABEL: st_global_f64
 define void @st_global_f64(ptr addrspace(1) %ptr, double %a) {
-; G32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
-; G64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; G32: st.global.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
+; G64: st.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; ALL: ret
   store double %a, ptr addrspace(1) %ptr
   ret void
 }
 ; ALL-LABEL: st_shared_f64
 define void @st_shared_f64(ptr addrspace(3) %ptr, double %a) {
-; LS32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
-; LS64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; LS32: st.shared.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
+; LS64: st.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; ALL: ret
   store double %a, ptr addrspace(3) %ptr
   ret void
 }
 ; ALL-LABEL: st_local_f64
 define void @st_local_f64(ptr addrspace(5) %ptr, double %a) {
-; LS32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
-; LS64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; LS32: st.local.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
+; LS64: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; ALL: ret
   store double %a, ptr addrspace(5) %ptr
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll
index c5062ed10e79f..cdf9dba825518 100644
--- a/llvm/test/CodeGen/NVPTX/st-generic.ll
+++ b/llvm/test/CodeGen/NVPTX/st-generic.ll
@@ -6,9 +6,9 @@
 ;; i8
 
 define void @st_global_i8(ptr addrspace(0) %ptr, i8 %a) {
-; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX32: st.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, ptr addrspace(0) %ptr
   ret void
@@ -17,9 +17,9 @@ define void @st_global_i8(ptr addrspace(0) %ptr, i8 %a) {
 ;; i16
 
 define void @st_global_i16(ptr addrspace(0) %ptr, i16 %a) {
-; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX32: st.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, ptr addrspace(0) %ptr
   ret void
@@ -28,9 +28,9 @@ define void @st_global_i16(ptr addrspace(0) %ptr, i16 %a) {
 ;; i32
 
 define void @st_global_i32(ptr addrspace(0) %ptr, i32 %a) {
-; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
+; PTX32: st.b32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, ptr addrspace(0) %ptr
   ret void
@@ -39,9 +39,9 @@ define void @st_global_i32(ptr addrspace(0) %ptr, i32 %a) {
 ;; i64
 
 define void @st_global_i64(ptr addrspace(0) %ptr, i64 %a) {
-; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
+; PTX32: st.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+; PTX64: st.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, ptr addrspace(0) %ptr
   ret void
@@ -50,9 +50,9 @@ define void @st_global_i64(ptr addrspace(0) %ptr, i64 %a) {
 ;; f32
 
 define void @st_global_f32(ptr addrspace(0) %ptr, float %a) {
-; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
+; PTX32: st.b32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, ptr addrspace(0) %ptr
   ret void
@@ -61,9 +61,9 @@ define void @st_global_f32(ptr addrspace(0) %ptr, float %a) {
 ;; f64
 
 define void @st_global_f64(ptr addrspace(0) %ptr, double %a) {
-; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
+; PTX32: st.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+; PTX64: st.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, ptr addrspace(0) %ptr
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index 0e67e52d52dab..5f1ea5d7b1e26 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -87,7 +87,7 @@ define void @st_param_f32() {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    { // callseq 3, 0
 ; CHECK-NEXT:    .param .b32 param0;
-; CHECK-NEXT:    st.param.f32 [param0], 0f40A00000;
+; CHECK-NEXT:    st.param.b32 [param0], 0f40A00000;
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_f32,
 ; CHECK-NEXT:    (
@@ -107,7 +107,7 @@ define void @st_param_f64() {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    { // callseq 4, 0
 ; CHECK-NEXT:    .param .b64 param0;
-; CHECK-NEXT:    st.param.f64 [param0], 0d4018000000000000;
+; CHECK-NEXT:    st.param.b64 [param0], 0d4018000000000000;
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_f64,
 ; CHECK-NEXT:    (
@@ -150,7 +150,7 @@ define void @st_param_v2_i8_ir(i8 %val) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v2_i8_ir_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v2_i8_ir_param_0];
 ; CHECK-NEXT:    { // callseq 6, 0
 ; CHECK-NEXT:    .param .align 2 .b8 param0[2];
 ; CHECK-NEXT:    st.param.v2.b8 [param0], {1, %rs1};
@@ -172,7 +172,7 @@ define void @st_param_v2_i8_ri(i8 %val) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v2_i8_ri_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v2_i8_ri_param_0];
 ; CHECK-NEXT:    { // callseq 7, 0
 ; CHECK-NEXT:    .param .align 2 .b8 param0[2];
 ; CHECK-NEXT:    st.param.v2.b8 [param0], {%rs1, 2};
@@ -214,7 +214,7 @@ define void @st_param_v2_i16_ir(i16 %val) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v2_i16_ir_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v2_i16_ir_param_0];
 ; CHECK-NEXT:    { // callseq 9, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v2.b16 [param0], {1, %rs1};
@@ -236,7 +236,7 @@ define void @st_param_v2_i16_ri(i16 %val) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v2_i16_ri_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v2_i16_ri_param_0];
 ; CHECK-NEXT:    { // callseq 10, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v2.b16 [param0], {%rs1, 2};
@@ -278,7 +278,7 @@ define void @st_param_v2_i32_ir(i32 %val) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v2_i32_ir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v2_i32_ir_param_0];
 ; CHECK-NEXT:    { // callseq 12, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.b32 [param0], {1, %r1};
@@ -300,7 +300,7 @@ define void @st_param_v2_i32_ri(i32 %val) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v2_i32_ri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v2_i32_ri_param_0];
 ; CHECK-NEXT:    { // callseq 13, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v2.b32 [param0], {%r1, 2};
@@ -342,7 +342,7 @@ define void @st_param_v2_i64_ir(i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [st_param_v2_i64_ir_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [st_param_v2_i64_ir_param_0];
 ; CHECK-NEXT:    { // callseq 15, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.b64 [param0], {1, %rd1};
@@ -364,7 +364,7 @@ define void @st_param_v2_i64_ri(i64 %val) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [st_param_v2_i64_ri_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [st_param_v2_i64_ri_param_0];
 ; CHECK-NEXT:    { // callseq 16, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v2.b64 [param0], {%rd1, 2};
@@ -389,7 +389,7 @@ define void @st_param_v2_f32_ii(float %val) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    { // callseq 17, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.f32 [param0], {0f3F800000, 0f40000000};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {0f3F800000, 0f40000000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f32,
 ; CHECK-NEXT:    (
@@ -406,10 +406,10 @@ define void @st_param_v2_f32_ir(float %val) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v2_f32_ir_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v2_f32_ir_param_0];
 ; CHECK-NEXT:    { // callseq 18, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.f32 [param0], {0f3F800000, %f1};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {0f3F800000, %f1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f32,
 ; CHECK-NEXT:    (
@@ -428,10 +428,10 @@ define void @st_param_v2_f32_ri(float %val) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v2_f32_ri_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v2_f32_ri_param_0];
 ; CHECK-NEXT:    { // callseq 19, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
-; CHECK-NEXT:    st.param.v2.f32 [param0], {%f1, 0f40000000};
+; CHECK-NEXT:    st.param.v2.b32 [param0], {%f1, 0f40000000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f32,
 ; CHECK-NEXT:    (
@@ -453,7 +453,7 @@ define void @st_param_v2_f64_ii(double %val) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    { // callseq 20, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v2.f64 [param0], {0d3FF0000000000000, 0d4000000000000000};
+; CHECK-NEXT:    st.param.v2.b64 [param0], {0d3FF0000000000000, 0d4000000000000000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f64,
 ; CHECK-NEXT:    (
@@ -470,10 +470,10 @@ define void @st_param_v2_f64_ir(double %val) {
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [st_param_v2_f64_ir_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [st_param_v2_f64_ir_param_0];
 ; CHECK-NEXT:    { // callseq 21, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v2.f64 [param0], {0d3FF0000000000000, %fd1};
+; CHECK-NEXT:    st.param.v2.b64 [param0], {0d3FF0000000000000, %fd1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f64,
 ; CHECK-NEXT:    (
@@ -492,10 +492,10 @@ define void @st_param_v2_f64_ri(double %val) {
 ; CHECK-NEXT:    .reg .b64 %fd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f64 %fd1, [st_param_v2_f64_ri_param_0];
+; CHECK-NEXT:    ld.param.b64 %fd1, [st_param_v2_f64_ri_param_0];
 ; CHECK-NEXT:    { // callseq 22, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v2.f64 [param0], {%fd1, 0d4000000000000000};
+; CHECK-NEXT:    st.param.v2.b64 [param0], {%fd1, 0d4000000000000000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v2_f64,
 ; CHECK-NEXT:    (
@@ -541,9 +541,9 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irrr_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_irrr_param_1];
-; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_irrr_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 24, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs1, %rs2, %rs3};
@@ -567,9 +567,9 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rirr_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rirr_param_1];
-; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_rirr_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 25, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, 2, %rs2, %rs3};
@@ -593,9 +593,9 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rrir_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rrir_param_1];
-; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_rrir_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 26, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, %rs2, 3, %rs3};
@@ -619,9 +619,9 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rrri_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rrri_param_1];
-; CHECK-NEXT:    ld.param.u8 %rs3, [st_param_v4_i8_rrri_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 27, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, %rs2, %rs3, 4};
@@ -645,8 +645,8 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_iirr_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_iirr_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 28, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {1, 2, %rs1, %rs2};
@@ -670,8 +670,8 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irir_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_irir_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irir_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irir_param_1];
 ; CHECK-NEXT:    { // callseq 29, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs1, 3, %rs2};
@@ -695,8 +695,8 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irri_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_irri_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irri_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irri_param_1];
 ; CHECK-NEXT:    { // callseq 30, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs1, %rs2, 4};
@@ -720,8 +720,8 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_riir_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_riir_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_riir_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_riir_param_1];
 ; CHECK-NEXT:    { // callseq 31, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, 2, 3, %rs2};
@@ -745,8 +745,8 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_riri_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_riri_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_riri_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_riri_param_1];
 ; CHECK-NEXT:    { // callseq 32, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, 2, %rs2, 4};
@@ -770,8 +770,8 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_rrii_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [st_param_v4_i8_rrii_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 33, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, %rs2, 3, 4};
@@ -795,7 +795,7 @@ define void @st_param_v4_i8_iiir(i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_iiir_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 34, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {1, 2, 3, %rs1};
@@ -819,7 +819,7 @@ define void @st_param_v4_i8_iiri(i8 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_iiri_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 35, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {1, 2, %rs1, 4};
@@ -843,7 +843,7 @@ define void @st_param_v4_i8_irii(i8 %b) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_irii_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irii_param_0];
 ; CHECK-NEXT:    { // callseq 36, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs1, 3, 4};
@@ -867,7 +867,7 @@ define void @st_param_v4_i8_riii(i8 %a) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [st_param_v4_i8_riii_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_riii_param_0];
 ; CHECK-NEXT:    { // callseq 37, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
 ; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, 2, 3, 4};
@@ -911,9 +911,9 @@ define void @st_param_v4_i16_irrr(i16 %b, i16 %c, i16 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irrr_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_irrr_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_irrr_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_irrr_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_irrr_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [st_param_v4_i16_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 39, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {1, %rs1, %rs2, %rs3};
@@ -937,9 +937,9 @@ define void @st_param_v4_i16_rirr(i16 %a, i16 %c, i16 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rirr_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rirr_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_rirr_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_rirr_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_rirr_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [st_param_v4_i16_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 40, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {%rs1, 2, %rs2, %rs3};
@@ -963,9 +963,9 @@ define void @st_param_v4_i16_rrir(i16 %a, i16 %b, i16 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rrir_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rrir_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_rrir_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_rrir_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_rrir_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [st_param_v4_i16_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 41, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {%rs1, %rs2, 3, %rs3};
@@ -989,9 +989,9 @@ define void @st_param_v4_i16_rrri(i16 %a, i16 %b, i16 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rrri_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rrri_param_1];
-; CHECK-NEXT:    ld.param.u16 %rs3, [st_param_v4_i16_rrri_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_rrri_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_rrri_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [st_param_v4_i16_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 42, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {%rs1, %rs2, %rs3, 4};
@@ -1015,8 +1015,8 @@ define void @st_param_v4_i16_iirr(i16 %c, i16 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_iirr_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_iirr_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_iirr_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 43, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {1, 2, %rs1, %rs2};
@@ -1040,8 +1040,8 @@ define void @st_param_v4_i16_irir(i16 %b, i16 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irir_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_irir_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_irir_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_irir_param_1];
 ; CHECK-NEXT:    { // callseq 44, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {1, %rs1, 3, %rs2};
@@ -1065,8 +1065,8 @@ define void @st_param_v4_i16_irri(i16 %b, i16 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irri_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_irri_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_irri_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_irri_param_1];
 ; CHECK-NEXT:    { // callseq 45, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {1, %rs1, %rs2, 4};
@@ -1090,8 +1090,8 @@ define void @st_param_v4_i16_riir(i16 %a, i16 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_riir_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_riir_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_riir_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_riir_param_1];
 ; CHECK-NEXT:    { // callseq 46, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {%rs1, 2, 3, %rs2};
@@ -1115,8 +1115,8 @@ define void @st_param_v4_i16_riri(i16 %a, i16 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_riri_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_riri_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_riri_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_riri_param_1];
 ; CHECK-NEXT:    { // callseq 47, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {%rs1, 2, %rs2, 4};
@@ -1140,8 +1140,8 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_rrii_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [st_param_v4_i16_rrii_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_rrii_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [st_param_v4_i16_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 48, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {%rs1, %rs2, 3, 4};
@@ -1165,7 +1165,7 @@ define void @st_param_v4_i16_iiir(i16 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_iiir_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 49, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {1, 2, 3, %rs1};
@@ -1189,7 +1189,7 @@ define void @st_param_v4_i16_iiri(i16 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_iiri_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 50, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {1, 2, %rs1, 4};
@@ -1213,7 +1213,7 @@ define void @st_param_v4_i16_irii(i16 %b) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_irii_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_irii_param_0];
 ; CHECK-NEXT:    { // callseq 51, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {1, %rs1, 3, 4};
@@ -1237,7 +1237,7 @@ define void @st_param_v4_i16_riii(i16 %a) {
 ; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [st_param_v4_i16_riii_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [st_param_v4_i16_riii_param_0];
 ; CHECK-NEXT:    { // callseq 52, 0
 ; CHECK-NEXT:    .param .align 8 .b8 param0[8];
 ; CHECK-NEXT:    st.param.v4.b16 [param0], {%rs1, 2, 3, 4};
@@ -1281,9 +1281,9 @@ define void @st_param_v4_i32_irrr(i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irrr_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_irrr_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_irrr_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_irrr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_irrr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_i32_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 54, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {1, %r1, %r2, %r3};
@@ -1307,9 +1307,9 @@ define void @st_param_v4_i32_rirr(i32 %a, i32 %c, i32 %d) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rirr_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rirr_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_rirr_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_rirr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_rirr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_i32_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 55, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 2, %r2, %r3};
@@ -1333,9 +1333,9 @@ define void @st_param_v4_i32_rrir(i32 %a, i32 %b, i32 %d) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rrir_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rrir_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_rrir_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_rrir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_rrir_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_i32_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 56, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, %r2, 3, %r3};
@@ -1359,9 +1359,9 @@ define void @st_param_v4_i32_rrri(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rrri_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rrri_param_1];
-; CHECK-NEXT:    ld.param.u32 %r3, [st_param_v4_i32_rrri_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_rrri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_rrri_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [st_param_v4_i32_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 57, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, %r2, %r3, 4};
@@ -1385,8 +1385,8 @@ define void @st_param_v4_i32_iirr(i32 %c, i32 %d) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_iirr_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_iirr_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_iirr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 58, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {1, 2, %r1, %r2};
@@ -1410,8 +1410,8 @@ define void @st_param_v4_i32_irir(i32 %b, i32 %d) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irir_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_irir_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_irir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_irir_param_1];
 ; CHECK-NEXT:    { // callseq 59, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {1, %r1, 3, %r2};
@@ -1435,8 +1435,8 @@ define void @st_param_v4_i32_irri(i32 %b, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irri_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_irri_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_irri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_irri_param_1];
 ; CHECK-NEXT:    { // callseq 60, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {1, %r1, %r2, 4};
@@ -1460,8 +1460,8 @@ define void @st_param_v4_i32_riir(i32 %a, i32 %d) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_riir_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_riir_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_riir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_riir_param_1];
 ; CHECK-NEXT:    { // callseq 61, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 2, 3, %r2};
@@ -1485,8 +1485,8 @@ define void @st_param_v4_i32_riri(i32 %a, i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_riri_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_riri_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_riri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_riri_param_1];
 ; CHECK-NEXT:    { // callseq 62, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 2, %r2, 4};
@@ -1510,8 +1510,8 @@ define void @st_param_v4_i32_rrii(i32 %a, i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_rrii_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [st_param_v4_i32_rrii_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_rrii_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [st_param_v4_i32_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 63, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, %r2, 3, 4};
@@ -1535,7 +1535,7 @@ define void @st_param_v4_i32_iiir(i32 %d) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_iiir_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 64, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {1, 2, 3, %r1};
@@ -1559,7 +1559,7 @@ define void @st_param_v4_i32_iiri(i32 %c) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_iiri_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 65, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {1, 2, %r1, 4};
@@ -1583,7 +1583,7 @@ define void @st_param_v4_i32_irii(i32 %b) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_irii_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_irii_param_0];
 ; CHECK-NEXT:    { // callseq 66, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {1, %r1, 3, 4};
@@ -1607,7 +1607,7 @@ define void @st_param_v4_i32_riii(i32 %a) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [st_param_v4_i32_riii_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [st_param_v4_i32_riii_param_0];
 ; CHECK-NEXT:    { // callseq 67, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
 ; CHECK-NEXT:    st.param.v4.b32 [param0], {%r1, 2, 3, 4};
@@ -1634,7 +1634,7 @@ define void @st_param_v4_f32_iiii() {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    { // callseq 68, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1651,12 +1651,12 @@ define void @st_param_v4_f32_irrr(float %b, float %c, float %d) {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irrr_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_irrr_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_irrr_param_2];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irrr_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_irrr_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_irrr_param_2];
 ; CHECK-NEXT:    { // callseq 69, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, %f1, %f2, %f3};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, %f2, %f3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1677,12 +1677,12 @@ define void @st_param_v4_f32_rirr(float %a, float %c, float %d) {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rirr_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rirr_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_rirr_param_2];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rirr_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rirr_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_rirr_param_2];
 ; CHECK-NEXT:    { // callseq 70, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {%f1, 0f40000000, %f2, %f3};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, %f2, %f3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1703,12 +1703,12 @@ define void @st_param_v4_f32_rrir(float %a, float %b, float %d) {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rrir_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rrir_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_rrir_param_2];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rrir_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rrir_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_rrir_param_2];
 ; CHECK-NEXT:    { // callseq 71, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {%f1, %f2, 0f40400000, %f3};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, %f2, 0f40400000, %f3};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1729,12 +1729,12 @@ define void @st_param_v4_f32_rrri(float %a, float %b, float %c) {
 ; CHECK-NEXT:    .reg .b32 %f<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rrri_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rrri_param_1];
-; CHECK-NEXT:    ld.param.f32 %f3, [st_param_v4_f32_rrri_param_2];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rrri_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rrri_param_1];
+; CHECK-NEXT:    ld.param.b32 %f3, [st_param_v4_f32_rrri_param_2];
 ; CHECK-NEXT:    { // callseq 72, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {%f1, %f2, %f3, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, %f2, %f3, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1755,11 +1755,11 @@ define void @st_param_v4_f32_iirr(float %c, float %d) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_iirr_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_iirr_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_iirr_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_iirr_param_1];
 ; CHECK-NEXT:    { // callseq 73, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, 0f40000000, %f1, %f2};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %f1, %f2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1780,11 +1780,11 @@ define void @st_param_v4_f32_irir(float %b, float %d) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irir_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_irir_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irir_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_irir_param_1];
 ; CHECK-NEXT:    { // callseq 74, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, %f1, 0f40400000, %f2};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, 0f40400000, %f2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1805,11 +1805,11 @@ define void @st_param_v4_f32_irri(float %b, float %c) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irri_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_irri_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irri_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_irri_param_1];
 ; CHECK-NEXT:    { // callseq 75, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, %f1, %f2, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, %f2, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1830,11 +1830,11 @@ define void @st_param_v4_f32_riir(float %a, float %d) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_riir_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_riir_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_riir_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_riir_param_1];
 ; CHECK-NEXT:    { // callseq 76, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {%f1, 0f40000000, 0f40400000, %f2};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, 0f40400000, %f2};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1855,11 +1855,11 @@ define void @st_param_v4_f32_riri(float %a, float %c) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_riri_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_riri_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_riri_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_riri_param_1];
 ; CHECK-NEXT:    { // callseq 77, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {%f1, 0f40000000, %f2, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, %f2, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1880,11 +1880,11 @@ define void @st_param_v4_f32_rrii(float %a, float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_rrii_param_0];
-; CHECK-NEXT:    ld.param.f32 %f2, [st_param_v4_f32_rrii_param_1];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_rrii_param_0];
+; CHECK-NEXT:    ld.param.b32 %f2, [st_param_v4_f32_rrii_param_1];
 ; CHECK-NEXT:    { // callseq 78, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {%f1, %f2, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, %f2, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1905,10 +1905,10 @@ define void @st_param_v4_f32_iiir(float %d) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_iiir_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_iiir_param_0];
 ; CHECK-NEXT:    { // callseq 79, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, 0f40000000, 0f40400000, %f1};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, %f1};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1929,10 +1929,10 @@ define void @st_param_v4_f32_iiri(float %c) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_iiri_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_iiri_param_0];
 ; CHECK-NEXT:    { // callseq 80, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, 0f40000000, %f1, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %f1, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1953,10 +1953,10 @@ define void @st_param_v4_f32_irii(float %b) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_irii_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_irii_param_0];
 ; CHECK-NEXT:    { // callseq 81, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {0f3F800000, %f1, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {0f3F800000, %f1, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
@@ -1977,10 +1977,10 @@ define void @st_param_v4_f32_riii(float %a) {
 ; CHECK-NEXT:    .reg .b32 %f<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.f32 %f1, [st_param_v4_f32_riii_param_0];
+; CHECK-NEXT:    ld.param.b32 %f1, [st_param_v4_f32_riii_param_0];
 ; CHECK-NEXT:    { // callseq 82, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[16];
-; CHECK-NEXT:    st.param.v4.f32 [param0], {%f1, 0f40000000, 0f40400000, 0f40800000};
+; CHECK-NEXT:    st.param.v4.b32 [param0], {%f1, 0f40000000, 0f40400000, 0f40800000};
 ; CHECK-NEXT:    call.uni
 ; CHECK-NEXT:    call_v4_f32,
 ; CHECK-NEXT:    (
diff --git a/llvm/test/CodeGen/NVPTX/st_bulk.ll b/llvm/test/CodeGen/NVPTX/st_bulk.ll
index 785f78a6f9519..944f221fb1af0 100644
--- a/llvm/test/CodeGen/NVPTX/st_bulk.ll
+++ b/llvm/test/CodeGen/NVPTX/st_bulk.ll
@@ -11,8 +11,8 @@ define void @st_bulk(ptr %dest_addr, i64 %size) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [st_bulk_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [st_bulk_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [st_bulk_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [st_bulk_param_1];
 ; CHECK-NEXT:    st.bulk [%rd1], %rd2, 0;
 ; CHECK-NEXT:    ret;
   call void @llvm.nvvm.st.bulk(ptr %dest_addr, i64 %size, i64 0)
@@ -26,8 +26,8 @@ define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-PTX64-EMPTY:
 ; CHECK-PTX64-NEXT:  // %bb.0:
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [st_bulk_shared_cta_param_0];
-; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [st_bulk_shared_cta_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [st_bulk_shared_cta_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [st_bulk_shared_cta_param_1];
 ; CHECK-PTX64-NEXT:    st.bulk.shared::cta [%rd1], %rd2, 0;
 ; CHECK-PTX64-NEXT:    ret;
 ;
@@ -37,8 +37,8 @@ define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) {
 ; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-PTX-SHARED32-EMPTY:
 ; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [st_bulk_shared_cta_param_0];
-; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [st_bulk_shared_cta_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [st_bulk_shared_cta_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [st_bulk_shared_cta_param_1];
 ; CHECK-PTX-SHARED32-NEXT:    st.bulk.shared::cta [%r1], %rd1, 0;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
    call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0)
diff --git a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll
index e9169c942b8bc..802ae26da41a8 100644
--- a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll
+++ b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll
@@ -49,7 +49,7 @@ define void @test_restore(ptr %p) {
 ; CHECK-32-NEXT:    .reg .b32 %r<3>;
 ; CHECK-32-EMPTY:
 ; CHECK-32-NEXT:  // %bb.0:
-; CHECK-32-NEXT:    ld.param.u32 %r1, [test_restore_param_0];
+; CHECK-32-NEXT:    ld.param.b32 %r1, [test_restore_param_0];
 ; CHECK-32-NEXT:    cvta.to.local.u32 %r2, %r1;
 ; CHECK-32-NEXT:    stackrestore.u32 %r2;
 ; CHECK-32-NEXT:    ret;
@@ -59,7 +59,7 @@ define void @test_restore(ptr %p) {
 ; CHECK-64-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-64-EMPTY:
 ; CHECK-64-NEXT:  // %bb.0:
-; CHECK-64-NEXT:    ld.param.u64 %rd1, [test_restore_param_0];
+; CHECK-64-NEXT:    ld.param.b64 %rd1, [test_restore_param_0];
 ; CHECK-64-NEXT:    cvta.to.local.u64 %rd2, %rd1;
 ; CHECK-64-NEXT:    stackrestore.u64 %rd2;
 ; CHECK-64-NEXT:    ret;
@@ -70,7 +70,7 @@ define void @test_restore(ptr %p) {
 ; CHECK-MIXED-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-MIXED-EMPTY:
 ; CHECK-MIXED-NEXT:  // %bb.0:
-; CHECK-MIXED-NEXT:    ld.param.u64 %rd1, [test_restore_param_0];
+; CHECK-MIXED-NEXT:    ld.param.b64 %rd1, [test_restore_param_0];
 ; CHECK-MIXED-NEXT:    cvta.to.local.u64 %rd2, %rd1;
 ; CHECK-MIXED-NEXT:    cvt.u32.u64 %r1, %rd2;
 ; CHECK-MIXED-NEXT:    stackrestore.u32 %r1;
diff --git a/llvm/test/CodeGen/NVPTX/store-retval.ll b/llvm/test/CodeGen/NVPTX/store-retval.ll
index 6a60c97b854b7..3bb7c83815088 100644
--- a/llvm/test/CodeGen/NVPTX/store-retval.ll
+++ b/llvm/test/CodeGen/NVPTX/store-retval.ll
@@ -23,7 +23,7 @@
 
 define %struct.StNoalign @func_StNoalign(ptr nocapture noundef readonly byval(%struct.StNoalign) align 4 %in) {
   ; CHECK-LABEL: .func{{.*}}func_StNoalign
-  ; CHECK:       ld.param.u32    [[R1:%r[0-9]+]],   [func_StNoalign_param_0];
+  ; CHECK:       ld.param.b32    [[R1:%r[0-9]+]],   [func_StNoalign_param_0];
   ; CHECK-NOT:   st.param.b32    [func_retval0+0],  %r{{[0-9]+}};
   ; CHECK-NOT:   st.param.b32    [func_retval0+4],  %r{{[0-9]+}};
   ; CHECK-NOT:   st.param.b32    [func_retval0+8],  %r{{[0-9]+}};
@@ -39,7 +39,7 @@ define %struct.StNoalign @func_StNoalign(ptr nocapture noundef readonly byval(%s
 
 define %struct.StAlign8 @func_StAlign8(ptr nocapture noundef readonly byval(%struct.StAlign8) align 8 %in) {
   ; CHECK-LABEL: .func{{.*}}func_StAlign8
-  ; CHECK:       ld.param.u32    [[R1:%r[0-9]+]],   [func_StAlign8_param_0];
+  ; CHECK:       ld.param.b32    [[R1:%r[0-9]+]],   [func_StAlign8_param_0];
   ; CHECK-NOT:   st.param.b32    [func_retval0+0],  %r{{[0-9]+}};
   ; CHECK-NOT:   st.param.b32    [func_retval0+4],  %r{{[0-9]+}};
   ; CHECK-NOT:   st.param.b32    [func_retval0+8],  %r{{[0-9]+}};
@@ -56,7 +56,7 @@ define %struct.StAlign8 @func_StAlign8(ptr nocapture noundef readonly byval(%str
 
 define %struct.StAlign16 @func_StAlign16(ptr nocapture noundef readonly byval(%struct.StAlign16) align 16 %in) {
   ; CHECK-LABEL: .func{{.*}}func_StAlign16
-  ; CHECK:       ld.param.u32    [[R1:%r[0-9]+]],   [func_StAlign16_param_0];
+  ; CHECK:       ld.param.b32    [[R1:%r[0-9]+]],   [func_StAlign16_param_0];
   ; CHECK-NOT:   st.param.b32    [func_retval0+0],  %r{{[0-9]+}};
   ; CHECK-NOT:   st.param.b32    [func_retval0+4],  %r{{[0-9]+}};
   ; CHECK-NOT:   st.param.b32    [func_retval0+8],  %r{{[0-9]+}};
diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll
index d871331cf1eca..52415b05e03d0 100644
--- a/llvm/test/CodeGen/NVPTX/store-undef.ll
+++ b/llvm/test/CodeGen/NVPTX/store-undef.ll
@@ -34,8 +34,8 @@ define void @test_store_param_def(i64 %param0, i32 %param1) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_param_def_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_store_param_def_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_store_param_def_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_store_param_def_param_1];
 ; CHECK-NEXT:    { // callseq 1, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[32];
 ; CHECK-NEXT:    st.param.b64 [param0], %rd1;
@@ -75,12 +75,12 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_def_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [test_store_def_param_1];
-; CHECK-NEXT:    ld.param.u64 %rd2, [test_store_def_param_2];
-; CHECK-NEXT:    st.v4.u32 [%rd2+16], {%r2, %r1, %r3, %r4};
-; CHECK-NEXT:    st.v2.u32 [%rd2+8], {%r5, %r1};
-; CHECK-NEXT:    st.u64 [%rd2], %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_store_def_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_store_def_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_store_def_param_2];
+; CHECK-NEXT:    st.v4.b32 [%rd2+16], {%r2, %r1, %r3, %r4};
+; CHECK-NEXT:    st.v2.b32 [%rd2+8], {%r5, %r1};
+; CHECK-NEXT:    st.b64 [%rd2], %rd1;
 ; CHECK-NEXT:    ret;
   %V2 = insertelement <2 x i32> undef, i32 %param1, i32 1
   %V4 = insertelement <4 x i32> undef, i32 %param1, i32 1
@@ -98,16 +98,16 @@ define void @test_store_volatile_undef(ptr %out, <8 x i32> %vec) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_undef_param_0];
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT:    st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
-; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd2;
-; CHECK-NEXT:    ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_store_volatile_undef_param_1];
-; CHECK-NEXT:    ld.param.v4.u32 {%r11, %r12, %r13, %r14}, [test_store_volatile_undef_param_1+16];
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd3], {%r11, %r12, %r13, %r14};
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd4], {%r7, %r8, %r9, %r10};
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r15, %r16, %r17, %r18};
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd1], {%r19, %r20, %r21, %r22};
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_store_volatile_undef_param_0];
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1+16], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    st.volatile.v2.b32 [%rd1+8], {%r5, %r6};
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd2;
+; CHECK-NEXT:    ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_store_volatile_undef_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r11, %r12, %r13, %r14}, [test_store_volatile_undef_param_1+16];
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd3], {%r11, %r12, %r13, %r14};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd4], {%r7, %r8, %r9, %r10};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1+16], {%r15, %r16, %r17, %r18};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r19, %r20, %r21, %r22};
 ; CHECK-NEXT:    ret;
   store volatile %struct.T undef, ptr %out
   store volatile <8 x i32> %vec, ptr undef
@@ -122,10 +122,10 @@ define void @test_store_volatile_of_poison(ptr %out) {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_of_poison_param_0];
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
-; CHECK-NEXT:    st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
-; CHECK-NEXT:    st.volatile.u64 [%rd1], %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_store_volatile_of_poison_param_0];
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1+16], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT:    st.volatile.v2.b32 [%rd1+8], {%r5, %r6};
+; CHECK-NEXT:    st.volatile.b64 [%rd1], %rd2;
 ; CHECK-NEXT:    ret;
   store volatile %struct.T poison, ptr %out
   ret void
@@ -138,12 +138,12 @@ define void @test_store_volatile_to_poison(%struct.T %param) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_store_volatile_to_poison_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_store_volatile_to_poison_param_0+8];
-; CHECK-NEXT:    ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_store_volatile_to_poison_param_0+16];
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd2], {%r3, %r4, %r5, %r6};
-; CHECK-NEXT:    st.volatile.v2.u32 [%rd3], {%r1, %r2};
-; CHECK-NEXT:    st.volatile.u64 [%rd4], %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_store_volatile_to_poison_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_store_volatile_to_poison_param_0+8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_store_volatile_to_poison_param_0+16];
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd2], {%r3, %r4, %r5, %r6};
+; CHECK-NEXT:    st.volatile.v2.b32 [%rd3], {%r1, %r2};
+; CHECK-NEXT:    st.volatile.b64 [%rd4], %rd1;
 ; CHECK-NEXT:    ret;
   store volatile %struct.T %param, ptr poison
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
index 3afff3245fbf6..ae74bbb866eb2 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -18,13 +18,13 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [foo_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [foo_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [foo_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [foo_param_1];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
-; CHECK-NEXT:    ld.param.u32 %r1, [foo_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [foo_param_2];
 ; CHECK-NEXT:    suld.b.1d.b32.trap {%r2}, [%rd1, {%r1}];
 ; CHECK-NEXT:    cvt.rn.f32.s32 %f1, %r2;
-; CHECK-NEXT:    st.global.f32 [%rd3], %f1;
+; CHECK-NEXT:    st.global.b32 [%rd3], %f1;
 ; CHECK-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
   %ret = sitofp i32 %val to float
@@ -42,12 +42,12 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [bar_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [bar_param_0];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; CHECK-NEXT:    ld.param.u32 %r1, [bar_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [bar_param_1];
 ; CHECK-NEXT:    suld.b.1d.b32.trap {%r2}, [surf0, {%r1}];
 ; CHECK-NEXT:    cvt.rn.f32.s32 %f1, %r2;
-; CHECK-NEXT:    st.global.f32 [%rd2], %f1;
+; CHECK-NEXT:    st.global.b32 [%rd2], %f1;
 ; CHECK-NEXT:    ret;
   %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx)
diff --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll
index 3166622f613c8..8dee5250920e6 100644
--- a/llvm/test/CodeGen/NVPTX/surf-read.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-read.ll
@@ -12,7 +12,7 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
   %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
 ; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
   %ret = sitofp i32 %val to float
-; CHECK: st.f32 [%rd{{[0-9]+}}], %f[[REDF]]
+; CHECK: st.b32 [%rd{{[0-9]+}}], %f[[REDF]]
   store float %ret, ptr %red
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
index 5dc44cb1925b0..abc2ea89b62cf 100644
--- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -17,9 +17,9 @@ define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [foo_param_0];
-; CHECK-NEXT:    ld.param.u32 %r1, [foo_param_1];
-; CHECK-NEXT:    ld.param.u32 %r2, [foo_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd1, [foo_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [foo_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [foo_param_2];
 ; CHECK-NEXT:    sust.b.1d.b32.trap [%rd1, {%r2}], {%r1};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
@@ -37,8 +37,8 @@ define ptx_kernel void @bar(i32 %val, i32 %idx) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [bar_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [bar_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [bar_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [bar_param_1];
 ; CHECK-NEXT:    sust.b.1d.b32.trap [surf0, {%r2}], {%r1};
 ; CHECK-NEXT:    ret;
   %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
index f80b5a5e16ea3..9c60af914fafd 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -17,8 +17,8 @@ define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) {
 ; CHECK_PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64-EMPTY:
 ; CHECK_PTX64-NEXT:  // %bb.0:
-; CHECK_PTX64-NEXT:    ld.param.u64 %rd1, [test_tcgen05_alloc_param_0];
-; CHECK_PTX64-NEXT:    ld.param.u32 %r1, [test_tcgen05_alloc_param_1];
+; CHECK_PTX64-NEXT:    ld.param.b64 %rd1, [test_tcgen05_alloc_param_0];
+; CHECK_PTX64-NEXT:    ld.param.b32 %r1, [test_tcgen05_alloc_param_1];
 ; CHECK_PTX64-NEXT:    tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1;
 ; CHECK_PTX64-NEXT:    tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
 ; CHECK_PTX64-NEXT:    ret;
@@ -29,8 +29,8 @@ define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) {
 ; CHECK_PTX64_SHARED32-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64_SHARED32-EMPTY:
 ; CHECK_PTX64_SHARED32-NEXT:  // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u64 %rd1, [test_tcgen05_alloc_param_0];
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u32 %r1, [test_tcgen05_alloc_param_1];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b64 %rd1, [test_tcgen05_alloc_param_0];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b32 %r1, [test_tcgen05_alloc_param_1];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1;
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1;
 ; CHECK_PTX64_SHARED32-NEXT:    ret;
@@ -48,8 +48,8 @@ define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) {
 ; CHECK_PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64-EMPTY:
 ; CHECK_PTX64-NEXT:  // %bb.0:
-; CHECK_PTX64-NEXT:    ld.param.u64 %rd1, [test_tcgen05_alloc_shared_param_0];
-; CHECK_PTX64-NEXT:    ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_1];
+; CHECK_PTX64-NEXT:    ld.param.b64 %rd1, [test_tcgen05_alloc_shared_param_0];
+; CHECK_PTX64-NEXT:    ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_1];
 ; CHECK_PTX64-NEXT:    tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%rd1], %r1;
 ; CHECK_PTX64-NEXT:    tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1;
 ; CHECK_PTX64-NEXT:    ret;
@@ -59,8 +59,8 @@ define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) {
 ; CHECK_PTX64_SHARED32-NEXT:    .reg .b32 %r<3>;
 ; CHECK_PTX64_SHARED32-EMPTY:
 ; CHECK_PTX64_SHARED32-NEXT:  // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_0];
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u32 %r2, [test_tcgen05_alloc_shared_param_1];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_0];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b32 %r2, [test_tcgen05_alloc_shared_param_1];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%r1], %r2;
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2;
 ; CHECK_PTX64_SHARED32-NEXT:    ret;
@@ -80,8 +80,8 @@ define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) {
 ; CHECK_PTX64-NEXT:    .reg .b32 %r<3>;
 ; CHECK_PTX64-EMPTY:
 ; CHECK_PTX64-NEXT:  // %bb.0:
-; CHECK_PTX64-NEXT:    ld.param.u32 %r1, [test_tcgen05_dealloc_param_0];
-; CHECK_PTX64-NEXT:    ld.param.u32 %r2, [test_tcgen05_dealloc_param_1];
+; CHECK_PTX64-NEXT:    ld.param.b32 %r1, [test_tcgen05_dealloc_param_0];
+; CHECK_PTX64-NEXT:    ld.param.b32 %r2, [test_tcgen05_dealloc_param_1];
 ; CHECK_PTX64-NEXT:    tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2;
 ; CHECK_PTX64-NEXT:    tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
 ; CHECK_PTX64-NEXT:    ret;
@@ -91,8 +91,8 @@ define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) {
 ; CHECK_PTX64_SHARED32-NEXT:    .reg .b32 %r<3>;
 ; CHECK_PTX64_SHARED32-EMPTY:
 ; CHECK_PTX64_SHARED32-NEXT:  // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u32 %r1, [test_tcgen05_dealloc_param_0];
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u32 %r2, [test_tcgen05_dealloc_param_1];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b32 %r1, [test_tcgen05_dealloc_param_0];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b32 %r2, [test_tcgen05_dealloc_param_1];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2;
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2;
 ; CHECK_PTX64_SHARED32-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
index 6e0ec6bcf4465..cc3b359d0624d 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
@@ -16,7 +16,7 @@ define void @test_tcgen05_commit(ptr %bar_addr) {
 ; CHECK_PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64-EMPTY:
 ; CHECK_PTX64-NEXT:  // %bb.0:
-; CHECK_PTX64-NEXT:    ld.param.u64 %rd1, [test_tcgen05_commit_param_0];
+; CHECK_PTX64-NEXT:    ld.param.b64 %rd1, [test_tcgen05_commit_param_0];
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
 ; CHECK_PTX64-NEXT:    ret;
@@ -26,7 +26,7 @@ define void @test_tcgen05_commit(ptr %bar_addr) {
 ; CHECK_PTX64_SHARED32-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64_SHARED32-EMPTY:
 ; CHECK_PTX64_SHARED32-NEXT:  // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u64 %rd1, [test_tcgen05_commit_param_0];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b64 %rd1, [test_tcgen05_commit_param_0];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
 ; CHECK_PTX64_SHARED32-NEXT:    ret;
@@ -44,7 +44,7 @@ define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) {
 ; CHECK_PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64-EMPTY:
 ; CHECK_PTX64-NEXT:  // %bb.0:
-; CHECK_PTX64-NEXT:    ld.param.u64 %rd1, [test_tcgen05_commit_shared_param_0];
+; CHECK_PTX64-NEXT:    ld.param.b64 %rd1, [test_tcgen05_commit_shared_param_0];
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
 ; CHECK_PTX64-NEXT:    ret;
@@ -54,7 +54,7 @@ define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) {
 ; CHECK_PTX64_SHARED32-NEXT:    .reg .b32 %r<2>;
 ; CHECK_PTX64_SHARED32-EMPTY:
 ; CHECK_PTX64_SHARED32-NEXT:  // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u32 %r1, [test_tcgen05_commit_shared_param_0];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b32 %r1, [test_tcgen05_commit_shared_param_0];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%r1];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1];
 ; CHECK_PTX64_SHARED32-NEXT:    ret;
@@ -78,8 +78,8 @@ define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) {
 ; CHECK_PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64-EMPTY:
 ; CHECK_PTX64-NEXT:  // %bb.0:
-; CHECK_PTX64-NEXT:    ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0];
-; CHECK_PTX64-NEXT:    ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1];
+; CHECK_PTX64-NEXT:    ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0];
+; CHECK_PTX64-NEXT:    ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1];
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
 ; CHECK_PTX64-NEXT:    ret;
@@ -90,8 +90,8 @@ define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) {
 ; CHECK_PTX64_SHARED32-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64_SHARED32-EMPTY:
 ; CHECK_PTX64_SHARED32-NEXT:  // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0];
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
 ; CHECK_PTX64_SHARED32-NEXT:    ret;
@@ -110,8 +110,8 @@ define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_
 ; CHECK_PTX64-NEXT:    .reg .b64 %rd<2>;
 ; CHECK_PTX64-EMPTY:
 ; CHECK_PTX64-NEXT:  // %bb.0:
-; CHECK_PTX64-NEXT:    ld.param.u64 %rd1, [test_tcgen05_commit_mc_shared_param_0];
-; CHECK_PTX64-NEXT:    ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
+; CHECK_PTX64-NEXT:    ld.param.b64 %rd1, [test_tcgen05_commit_mc_shared_param_0];
+; CHECK_PTX64-NEXT:    ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
 ; CHECK_PTX64-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
 ; CHECK_PTX64-NEXT:    ret;
@@ -122,8 +122,8 @@ define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_
 ; CHECK_PTX64_SHARED32-NEXT:    .reg .b32 %r<2>;
 ; CHECK_PTX64_SHARED32-EMPTY:
 ; CHECK_PTX64_SHARED32-NEXT:  // %bb.0:
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u32 %r1, [test_tcgen05_commit_mc_shared_param_0];
-; CHECK_PTX64_SHARED32-NEXT:    ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_param_0];
+; CHECK_PTX64_SHARED32-NEXT:    ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1;
 ; CHECK_PTX64_SHARED32-NEXT:    tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1;
 ; CHECK_PTX64_SHARED32-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
index 50dc93325c286..780116c42380f 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
@@ -10,8 +10,8 @@ define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -29,8 +29,8 @@ define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -48,8 +48,8 @@ define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_32x128_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_32x128_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_32x128_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_32x128_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -68,8 +68,8 @@ define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_128x128b_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x128b_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x128b [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x128b [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -87,8 +87,8 @@ define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_128x256b_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x256b_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x256b [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x256b [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -106,8 +106,8 @@ define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_4x256b_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_4x256b_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.4x256b [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.4x256b [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -126,8 +126,8 @@ define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sde
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -145,8 +145,8 @@ define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -164,8 +164,8 @@ define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sde
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -183,8 +183,8 @@ define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sd
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -202,8 +202,8 @@ define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sd
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -221,8 +221,8 @@ define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -241,8 +241,8 @@ define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sde
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -260,8 +260,8 @@ define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -279,8 +279,8 @@ define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sde
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -298,8 +298,8 @@ define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sd
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -317,8 +317,8 @@ define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sd
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
@@ -336,8 +336,8 @@ define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1];
 ; CHECK-NEXT:    tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
index 83dbcb1bc02b1..7e65338c4525d 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
@@ -11,7 +11,7 @@ define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x64b_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x64b_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x64b.x1.b32 {%r2}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x64b.x2.b32 {%r3, %r4}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x64b.x4.b32 {%r5, %r6, %r7, %r8}, [%r1];
@@ -46,7 +46,7 @@ define void @nvvm_tcgen05_ld_16x64b_pack(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x64b_pack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x64b_pack_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 {%r2}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 {%r3, %r4}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 {%r5, %r6, %r7, %r8}, [%r1];
@@ -81,7 +81,7 @@ define void @nvvm_tcgen05_ld_16x128b(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<256>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x128b_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x128b_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x128b.x1.b32 {%r2, %r3}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x128b.x2.b32 {%r4, %r5, %r6, %r7}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x128b.x4.b32 {%r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15}, [%r1];
@@ -113,7 +113,7 @@ define void @nvvm_tcgen05_ld_16x128b_pack(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<256>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x128b_pack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x128b_pack_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 {%r2, %r3}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 {%r4, %r5, %r6, %r7}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 {%r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15}, [%r1];
@@ -145,7 +145,7 @@ define void @nvvm_tcgen05_ld_16x256b(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<254>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x256b_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x256b_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x256b.x1.b32 {%r2, %r3, %r4, %r5}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x256b.x2.b32 {%r6, %r7, %r8, %r9, %r10, %r11, %r12, %r13}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x256b.x4.b32 {%r14, %r15, %r16, %r17, %r18, %r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29}, [%r1];
@@ -174,7 +174,7 @@ define void @nvvm_tcgen05_ld_16x256b_pack(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<254>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x256b_pack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x256b_pack_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 {%r2, %r3, %r4, %r5}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 {%r6, %r7, %r8, %r9, %r10, %r11, %r12, %r13}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 {%r14, %r15, %r16, %r17, %r18, %r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29}, [%r1];
@@ -203,7 +203,7 @@ define void @nvvm_tcgen05_ld_32x32b(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_32x32b_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.32x32b.x1.b32 {%r2}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.32x32b.x2.b32 {%r3, %r4}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.32x32b.x4.b32 {%r5, %r6, %r7, %r8}, [%r1];
@@ -237,7 +237,7 @@ define void @nvvm_tcgen05_ld_32x32b_pack(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_32x32b_pack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_pack_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 {%r2}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 {%r3, %r4}, [%r1];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 {%r5, %r6, %r7, %r8}, [%r1];
@@ -272,7 +272,7 @@ define void @nvvm_tcgen05_ld_16x32bx2(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x32bx2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x32bx2.x1.b32 {%r2}, [%r1], 2;
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x32bx2.x2.b32 {%r3, %r4}, [%r1], 2;
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x32bx2.x4.b32 {%r5, %r6, %r7, %r8}, [%r1], 2;
@@ -306,7 +306,7 @@ define void @nvvm_tcgen05_ld_16x32bx2_pack(ptr addrspace(6) %taddr) {
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_ld_16x32bx2_pack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_pack_param_0];
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 {%r2}, [%r1], 2;
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 {%r3, %r4}, [%r1], 2;
 ; CHECK-NEXT:    tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 {%r5, %r6, %r7, %r8}, [%r1], 2;
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
index 13a45b9d86dcf..590d75533bb8b 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
@@ -12,7 +12,7 @@ define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [test_tcgen05_shift_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_tcgen05_shift_param_0];
 ; CHECK-NEXT:    tcgen05.shift.cta_group::1.down [%r1];
 ; CHECK-NEXT:    tcgen05.shift.cta_group::2.down [%r1];
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
index c22f795193c7d..c323a54d75d7f 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
@@ -11,79 +11,79 @@ define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x64b_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [nvvm_tcgen05_st_16x64b_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x64b_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [nvvm_tcgen05_st_16x64b_param_1];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x1.b32 [%r1], {%r2};
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x2.b32 [%r1], {%r3, %r4};
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x4.b32 [%r1], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT:    ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x8.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12};
-; CHECK-NEXT:    ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x16.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20};
-; CHECK-NEXT:    ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x32.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36};
-; CHECK-NEXT:    ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x64b_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x64b_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x64b_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x64b_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x64.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68};
-; CHECK-NEXT:    ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x64b_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x64b_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x64b_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x64b_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x64b_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x64b_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x64b_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x64b_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x64b_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x64b_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x128.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 0)
@@ -111,79 +111,79 @@ define void @nvvm_tcgen05_st_16x64b_unpack(ptr addrspace(6) %taddr, i32 %stv1, <
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x64b_unpack_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [nvvm_tcgen05_st_16x64b_unpack_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x64b_unpack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [nvvm_tcgen05_st_16x64b_unpack_param_1];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [%r1], {%r2};
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_unpack_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_unpack_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [%r1], {%r3, %r4};
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_unpack_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_unpack_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [%r1], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT:    ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_unpack_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_unpack_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_unpack_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_unpack_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12};
-; CHECK-NEXT:    ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_unpack_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_unpack_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_unpack_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_unpack_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_unpack_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_unpack_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_unpack_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_unpack_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20};
-; CHECK-NEXT:    ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_unpack_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_unpack_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_unpack_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_unpack_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_unpack_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_unpack_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_unpack_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_unpack_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_unpack_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_unpack_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_unpack_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_unpack_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_unpack_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_unpack_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_unpack_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_unpack_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36};
-; CHECK-NEXT:    ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_unpack_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_unpack_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_unpack_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x64b_unpack_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_unpack_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_unpack_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_unpack_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x64b_unpack_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_unpack_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_unpack_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_unpack_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_unpack_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_unpack_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_unpack_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_unpack_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_unpack_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_unpack_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_unpack_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_unpack_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x64b_unpack_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_unpack_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_unpack_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_unpack_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x64b_unpack_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_unpack_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_unpack_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_unpack_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_unpack_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_unpack_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_unpack_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_unpack_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_unpack_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68};
-; CHECK-NEXT:    ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_unpack_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_unpack_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_unpack_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_unpack_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_unpack_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x64b_unpack_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_unpack_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_unpack_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_unpack_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_unpack_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_unpack_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x64b_unpack_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_unpack_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_unpack_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_unpack_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_unpack_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_unpack_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_unpack_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_unpack_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_unpack_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_unpack_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_unpack_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_unpack_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x64b_unpack_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x64b_unpack_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_unpack_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_unpack_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_unpack_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_unpack_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_unpack_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_unpack_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x64b_unpack_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_unpack_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_unpack_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_unpack_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_unpack_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_unpack_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x64b_unpack_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_unpack_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_unpack_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_unpack_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_unpack_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_unpack_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x64b_unpack_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_unpack_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_unpack_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_unpack_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_unpack_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_unpack_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_unpack_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_unpack_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_unpack_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_unpack_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_unpack_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_unpack_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x64b_unpack_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x64b_unpack_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_unpack_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_unpack_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_unpack_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_unpack_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_unpack_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_unpack_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x64b_unpack_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 1)
@@ -211,77 +211,77 @@ define void @nvvm_tcgen05_st_16x128b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i3
 ; CHECK-NEXT:    .reg .b32 %r<256>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x128b_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x128b_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x1.b32 [%r1], {%r2, %r3};
-; CHECK-NEXT:    ld.param.v4.u32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x2.b32 [%r1], {%r4, %r5, %r6, %r7};
-; CHECK-NEXT:    ld.param.v4.u32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x4.b32 [%r1], {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11};
-; CHECK-NEXT:    ld.param.v4.u32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r24, %r25, %r26, %r27}, [nvvm_tcgen05_st_16x128b_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r24, %r25, %r26, %r27}, [nvvm_tcgen05_st_16x128b_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x8.b32 [%r1], {%r28, %r29, %r30, %r31, %r24, %r25, %r26, %r27, %r20, %r21, %r22, %r23, %r16, %r17, %r18, %r19};
-; CHECK-NEXT:    ld.param.v4.u32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r48, %r49, %r50, %r51}, [nvvm_tcgen05_st_16x128b_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r52, %r53, %r54, %r55}, [nvvm_tcgen05_st_16x128b_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r48, %r49, %r50, %r51}, [nvvm_tcgen05_st_16x128b_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r52, %r53, %r54, %r55}, [nvvm_tcgen05_st_16x128b_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x16.b32 [%r1], {%r60, %r61, %r62, %r63, %r56, %r57, %r58, %r59, %r52, %r53, %r54, %r55, %r48, %r49, %r50, %r51, %r44, %r45, %r46, %r47, %r40, %r41, %r42, %r43, %r36, %r37, %r38, %r39, %r32, %r33, %r34, %r35};
-; CHECK-NEXT:    ld.param.v4.u32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r92, %r93, %r94, %r95}, [nvvm_tcgen05_st_16x128b_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r92, %r93, %r94, %r95}, [nvvm_tcgen05_st_16x128b_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x32.b32 [%r1], {%r124, %r125, %r126, %r127, %r120, %r121, %r122, %r123, %r116, %r117, %r118, %r119, %r112, %r113, %r114, %r115, %r108, %r109, %r110, %r111, %r104, %r105, %r106, %r107, %r100, %r101, %r102, %r103, %r96, %r97, %r98, %r99, %r92, %r93, %r94, %r95, %r88, %r89, %r90, %r91, %r84, %r85, %r86, %r87, %r80, %r81, %r82, %r83, %r76, %r77, %r78, %r79, %r72, %r73, %r74, %r75, %r68, %r69, %r70, %r71, %r64, %r65, %r66, %r67};
-; CHECK-NEXT:    ld.param.v4.u32 {%r128, %r129, %r130, %r131}, [nvvm_tcgen05_st_16x128b_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r128, %r129, %r130, %r131}, [nvvm_tcgen05_st_16x128b_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x64.b32 [%r1], {%r252, %r253, %r254, %r255, %r248, %r249, %r250, %r251, %r244, %r245, %r246, %r247, %r240, %r241, %r242, %r243, %r236, %r237, %r238, %r239, %r232, %r233, %r234, %r235, %r228, %r229, %r230, %r231, %r224, %r225, %r226, %r227, %r220, %r221, %r222, %r223, %r216, %r217, %r218, %r219, %r212, %r213, %r214, %r215, %r208, %r209, %r210, %r211, %r204, %r205, %r206, %r207, %r200, %r201, %r202, %r203, %r196, %r197, %r198, %r199, %r192, %r193, %r194, %r195, %r188, %r189, %r190, %r191, %r184, %r185, %r186, %r187, %r180, %r181, %r182, %r183, %r176, %r177, %r178, %r179, %r172, %r173, %r174, %r175, %r168, %r169, %r170, %r171, %r164, %r165, %r166, %r167, %r160, %r161, %r162, %r163, %r156, %r157, %r158, %r159, %r152, %r153, %r154, %r155, %r148, %r149, %r150, %r151, %r144, %r145, %r146, %r147, %r140, %r141, %r142, %r143, %r136, %r137, %r138, %r139, %r132, %r133, %r134, %r135, %r128, %r129, %r130, %r131};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) %taddr, <2 x i32> %stv2, i1 0)
@@ -307,77 +307,77 @@ define void @nvvm_tcgen05_st_16x128b_unpack(ptr addrspace(6) %taddr, i32 %stv1,
 ; CHECK-NEXT:    .reg .b32 %r<256>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x128b_unpack_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_unpack_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x128b_unpack_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_unpack_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [%r1], {%r2, %r3};
-; CHECK-NEXT:    ld.param.v4.u32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_unpack_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_unpack_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [%r1], {%r4, %r5, %r6, %r7};
-; CHECK-NEXT:    ld.param.v4.u32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_unpack_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_unpack_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_unpack_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_unpack_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [%r1], {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11};
-; CHECK-NEXT:    ld.param.v4.u32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_unpack_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_unpack_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r24, %r25, %r26, %r27}, [nvvm_tcgen05_st_16x128b_unpack_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_unpack_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_unpack_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_unpack_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r24, %r25, %r26, %r27}, [nvvm_tcgen05_st_16x128b_unpack_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_unpack_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [%r1], {%r28, %r29, %r30, %r31, %r24, %r25, %r26, %r27, %r20, %r21, %r22, %r23, %r16, %r17, %r18, %r19};
-; CHECK-NEXT:    ld.param.v4.u32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_unpack_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_unpack_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_unpack_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_unpack_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r48, %r49, %r50, %r51}, [nvvm_tcgen05_st_16x128b_unpack_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r52, %r53, %r54, %r55}, [nvvm_tcgen05_st_16x128b_unpack_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_unpack_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_unpack_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_unpack_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_unpack_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_unpack_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_unpack_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r48, %r49, %r50, %r51}, [nvvm_tcgen05_st_16x128b_unpack_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r52, %r53, %r54, %r55}, [nvvm_tcgen05_st_16x128b_unpack_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_unpack_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_unpack_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [%r1], {%r60, %r61, %r62, %r63, %r56, %r57, %r58, %r59, %r52, %r53, %r54, %r55, %r48, %r49, %r50, %r51, %r44, %r45, %r46, %r47, %r40, %r41, %r42, %r43, %r36, %r37, %r38, %r39, %r32, %r33, %r34, %r35};
-; CHECK-NEXT:    ld.param.v4.u32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_unpack_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_unpack_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_unpack_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_unpack_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_unpack_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_unpack_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_unpack_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r92, %r93, %r94, %r95}, [nvvm_tcgen05_st_16x128b_unpack_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_unpack_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_unpack_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_unpack_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_unpack_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_unpack_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_unpack_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_unpack_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_unpack_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_unpack_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_unpack_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_unpack_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_unpack_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_unpack_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_unpack_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_unpack_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r92, %r93, %r94, %r95}, [nvvm_tcgen05_st_16x128b_unpack_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_unpack_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_unpack_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_unpack_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_unpack_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_unpack_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_unpack_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_unpack_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_unpack_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [%r1], {%r124, %r125, %r126, %r127, %r120, %r121, %r122, %r123, %r116, %r117, %r118, %r119, %r112, %r113, %r114, %r115, %r108, %r109, %r110, %r111, %r104, %r105, %r106, %r107, %r100, %r101, %r102, %r103, %r96, %r97, %r98, %r99, %r92, %r93, %r94, %r95, %r88, %r89, %r90, %r91, %r84, %r85, %r86, %r87, %r80, %r81, %r82, %r83, %r76, %r77, %r78, %r79, %r72, %r73, %r74, %r75, %r68, %r69, %r70, %r71, %r64, %r65, %r66, %r67};
-; CHECK-NEXT:    ld.param.v4.u32 {%r128, %r129, %r130, %r131}, [nvvm_tcgen05_st_16x128b_unpack_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_unpack_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_unpack_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_unpack_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_unpack_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_unpack_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_unpack_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_unpack_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_unpack_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_unpack_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_unpack_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_unpack_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_unpack_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_unpack_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_unpack_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_unpack_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_unpack_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_unpack_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_unpack_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_unpack_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_unpack_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_unpack_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_unpack_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_unpack_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_unpack_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_unpack_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_unpack_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_unpack_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_unpack_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_unpack_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_unpack_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_unpack_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r128, %r129, %r130, %r131}, [nvvm_tcgen05_st_16x128b_unpack_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_unpack_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_unpack_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_unpack_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_unpack_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_unpack_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_unpack_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_unpack_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_unpack_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_unpack_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_unpack_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_unpack_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_unpack_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_unpack_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_unpack_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_unpack_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_unpack_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_unpack_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_unpack_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_unpack_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_unpack_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_unpack_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_unpack_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_unpack_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_unpack_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_unpack_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_unpack_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_unpack_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_unpack_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_unpack_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_unpack_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_unpack_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [%r1], {%r252, %r253, %r254, %r255, %r248, %r249, %r250, %r251, %r244, %r245, %r246, %r247, %r240, %r241, %r242, %r243, %r236, %r237, %r238, %r239, %r232, %r233, %r234, %r235, %r228, %r229, %r230, %r231, %r224, %r225, %r226, %r227, %r220, %r221, %r222, %r223, %r216, %r217, %r218, %r219, %r212, %r213, %r214, %r215, %r208, %r209, %r210, %r211, %r204, %r205, %r206, %r207, %r200, %r201, %r202, %r203, %r196, %r197, %r198, %r199, %r192, %r193, %r194, %r195, %r188, %r189, %r190, %r191, %r184, %r185, %r186, %r187, %r180, %r181, %r182, %r183, %r176, %r177, %r178, %r179, %r172, %r173, %r174, %r175, %r168, %r169, %r170, %r171, %r164, %r165, %r166, %r167, %r160, %r161, %r162, %r163, %r156, %r157, %r158, %r159, %r152, %r153, %r154, %r155, %r148, %r149, %r150, %r151, %r144, %r145, %r146, %r147, %r140, %r141, %r142, %r143, %r136, %r137, %r138, %r139, %r132, %r133, %r134, %r135, %r128, %r129, %r130, %r131};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) %taddr, <2 x i32> %stv2, i1 1)
@@ -403,75 +403,75 @@ define void @nvvm_tcgen05_st_16x256b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i3
 ; CHECK-NEXT:    .reg .b32 %r<254>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x256b_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_param_3];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x256b_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x1.b32 [%r1], {%r2, %r3, %r4, %r5};
-; CHECK-NEXT:    ld.param.v4.u32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r10, %r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x2.b32 [%r1], {%r10, %r11, %r12, %r13, %r6, %r7, %r8, %r9};
-; CHECK-NEXT:    ld.param.v4.u32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x4.b32 [%r1], {%r26, %r27, %r28, %r29, %r22, %r23, %r24, %r25, %r18, %r19, %r20, %r21, %r14, %r15, %r16, %r17};
-; CHECK-NEXT:    ld.param.v4.u32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x8.b32 [%r1], {%r58, %r59, %r60, %r61, %r54, %r55, %r56, %r57, %r50, %r51, %r52, %r53, %r46, %r47, %r48, %r49, %r42, %r43, %r44, %r45, %r38, %r39, %r40, %r41, %r34, %r35, %r36, %r37, %r30, %r31, %r32, %r33};
-; CHECK-NEXT:    ld.param.v4.u32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r70, %r71, %r72, %r73}, [nvvm_tcgen05_st_16x256b_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r102, %r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r70, %r71, %r72, %r73}, [nvvm_tcgen05_st_16x256b_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r102, %r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x16.b32 [%r1], {%r122, %r123, %r124, %r125, %r118, %r119, %r120, %r121, %r114, %r115, %r116, %r117, %r110, %r111, %r112, %r113, %r106, %r107, %r108, %r109, %r102, %r103, %r104, %r105, %r98, %r99, %r100, %r101, %r94, %r95, %r96, %r97, %r90, %r91, %r92, %r93, %r86, %r87, %r88, %r89, %r82, %r83, %r84, %r85, %r78, %r79, %r80, %r81, %r74, %r75, %r76, %r77, %r70, %r71, %r72, %r73, %r66, %r67, %r68, %r69, %r62, %r63, %r64, %r65};
-; CHECK-NEXT:    ld.param.v4.u32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r158, %r159, %r160, %r161}, [nvvm_tcgen05_st_16x256b_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r178, %r179, %r180, %r181}, [nvvm_tcgen05_st_16x256b_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r158, %r159, %r160, %r161}, [nvvm_tcgen05_st_16x256b_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r178, %r179, %r180, %r181}, [nvvm_tcgen05_st_16x256b_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x32.b32 [%r1], {%r250, %r251, %r252, %r253, %r246, %r247, %r248, %r249, %r242, %r243, %r244, %r245, %r238, %r239, %r240, %r241, %r234, %r235, %r236, %r237, %r230, %r231, %r232, %r233, %r226, %r227, %r228, %r229, %r222, %r223, %r224, %r225, %r218, %r219, %r220, %r221, %r214, %r215, %r216, %r217, %r210, %r211, %r212, %r213, %r206, %r207, %r208, %r209, %r202, %r203, %r204, %r205, %r198, %r199, %r200, %r201, %r194, %r195, %r196, %r197, %r190, %r191, %r192, %r193, %r186, %r187, %r188, %r189, %r182, %r183, %r184, %r185, %r178, %r179, %r180, %r181, %r174, %r175, %r176, %r177, %r170, %r171, %r172, %r173, %r166, %r167, %r168, %r169, %r162, %r163, %r164, %r165, %r158, %r159, %r160, %r161, %r154, %r155, %r156, %r157, %r150, %r151, %r152, %r153, %r146, %r147, %r148, %r149, %r142, %r143, %r144, %r145, %r138, %r139, %r140, %r141, %r134, %r135, %r136, %r137, %r130, %r131, %r132, %r133, %r126, %r127, %r128, %r129};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) %taddr, <4 x i32> %stv4, i1 0)
@@ -495,75 +495,75 @@ define void @nvvm_tcgen05_st_16x256b_unpack(ptr addrspace(6) %taddr, i32 %stv1,
 ; CHECK-NEXT:    .reg .b32 %r<254>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x256b_unpack_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_unpack_param_3];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x256b_unpack_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_unpack_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [%r1], {%r2, %r3, %r4, %r5};
-; CHECK-NEXT:    ld.param.v4.u32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_unpack_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r10, %r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_unpack_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_unpack_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_unpack_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [%r1], {%r10, %r11, %r12, %r13, %r6, %r7, %r8, %r9};
-; CHECK-NEXT:    ld.param.v4.u32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_unpack_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_unpack_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_unpack_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_unpack_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_unpack_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_unpack_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_unpack_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_unpack_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [%r1], {%r26, %r27, %r28, %r29, %r22, %r23, %r24, %r25, %r18, %r19, %r20, %r21, %r14, %r15, %r16, %r17};
-; CHECK-NEXT:    ld.param.v4.u32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_unpack_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_unpack_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_unpack_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_unpack_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_unpack_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_unpack_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_unpack_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_unpack_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_unpack_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_unpack_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_unpack_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_unpack_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_unpack_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_unpack_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_unpack_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_unpack_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [%r1], {%r58, %r59, %r60, %r61, %r54, %r55, %r56, %r57, %r50, %r51, %r52, %r53, %r46, %r47, %r48, %r49, %r42, %r43, %r44, %r45, %r38, %r39, %r40, %r41, %r34, %r35, %r36, %r37, %r30, %r31, %r32, %r33};
-; CHECK-NEXT:    ld.param.v4.u32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_unpack_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_unpack_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r70, %r71, %r72, %r73}, [nvvm_tcgen05_st_16x256b_unpack_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_unpack_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_unpack_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_unpack_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_unpack_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_unpack_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_unpack_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_unpack_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r102, %r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_unpack_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_unpack_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_unpack_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_unpack_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_unpack_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_unpack_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_unpack_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_unpack_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r70, %r71, %r72, %r73}, [nvvm_tcgen05_st_16x256b_unpack_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_unpack_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_unpack_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_unpack_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_unpack_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_unpack_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_unpack_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_unpack_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r102, %r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_unpack_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_unpack_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_unpack_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_unpack_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_unpack_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_unpack_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [%r1], {%r122, %r123, %r124, %r125, %r118, %r119, %r120, %r121, %r114, %r115, %r116, %r117, %r110, %r111, %r112, %r113, %r106, %r107, %r108, %r109, %r102, %r103, %r104, %r105, %r98, %r99, %r100, %r101, %r94, %r95, %r96, %r97, %r90, %r91, %r92, %r93, %r86, %r87, %r88, %r89, %r82, %r83, %r84, %r85, %r78, %r79, %r80, %r81, %r74, %r75, %r76, %r77, %r70, %r71, %r72, %r73, %r66, %r67, %r68, %r69, %r62, %r63, %r64, %r65};
-; CHECK-NEXT:    ld.param.v4.u32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_unpack_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_unpack_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_unpack_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_unpack_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_unpack_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_unpack_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_unpack_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_unpack_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r158, %r159, %r160, %r161}, [nvvm_tcgen05_st_16x256b_unpack_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_unpack_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_unpack_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_unpack_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_unpack_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r178, %r179, %r180, %r181}, [nvvm_tcgen05_st_16x256b_unpack_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_unpack_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_unpack_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_unpack_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_unpack_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_unpack_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_unpack_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_unpack_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_unpack_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_unpack_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_unpack_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_unpack_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_unpack_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_unpack_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_unpack_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_unpack_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_unpack_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_unpack_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_unpack_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_unpack_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_unpack_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_unpack_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_unpack_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_unpack_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_unpack_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_unpack_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_unpack_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r158, %r159, %r160, %r161}, [nvvm_tcgen05_st_16x256b_unpack_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_unpack_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_unpack_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_unpack_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_unpack_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r178, %r179, %r180, %r181}, [nvvm_tcgen05_st_16x256b_unpack_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_unpack_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_unpack_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_unpack_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_unpack_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_unpack_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_unpack_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_unpack_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_unpack_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_unpack_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_unpack_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_unpack_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_unpack_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_unpack_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_unpack_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_unpack_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_unpack_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_unpack_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_unpack_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [%r1], {%r250, %r251, %r252, %r253, %r246, %r247, %r248, %r249, %r242, %r243, %r244, %r245, %r238, %r239, %r240, %r241, %r234, %r235, %r236, %r237, %r230, %r231, %r232, %r233, %r226, %r227, %r228, %r229, %r222, %r223, %r224, %r225, %r218, %r219, %r220, %r221, %r214, %r215, %r216, %r217, %r210, %r211, %r212, %r213, %r206, %r207, %r208, %r209, %r202, %r203, %r204, %r205, %r198, %r199, %r200, %r201, %r194, %r195, %r196, %r197, %r190, %r191, %r192, %r193, %r186, %r187, %r188, %r189, %r182, %r183, %r184, %r185, %r178, %r179, %r180, %r181, %r174, %r175, %r176, %r177, %r170, %r171, %r172, %r173, %r166, %r167, %r168, %r169, %r162, %r163, %r164, %r165, %r158, %r159, %r160, %r161, %r154, %r155, %r156, %r157, %r150, %r151, %r152, %r153, %r146, %r147, %r148, %r149, %r142, %r143, %r144, %r145, %r138, %r139, %r140, %r141, %r134, %r135, %r136, %r137, %r130, %r131, %r132, %r133, %r126, %r127, %r128, %r129};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) %taddr, <4 x i32> %stv4, i1 1)
@@ -587,79 +587,79 @@ define void @nvvm_tcgen05_st_32x32b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_32x32b_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [nvvm_tcgen05_st_32x32b_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_32x32b_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [nvvm_tcgen05_st_32x32b_param_1];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x1.b32 [%r1], {%r2};
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x2.b32 [%r1], {%r3, %r4};
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x4.b32 [%r1], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT:    ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x8.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12};
-; CHECK-NEXT:    ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x16.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20};
-; CHECK-NEXT:    ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_32x32b_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_32x32b_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x32.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36};
-; CHECK-NEXT:    ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_32x32b_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_32x32b_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x64.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68};
-; CHECK-NEXT:    ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_32x32b_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_32x32b_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x128.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 0)
@@ -687,79 +687,79 @@ define void @nvvm_tcgen05_st_32x32b_unpack(ptr addrspace(6) %taddr, i32 %stv1, <
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_32x32b_unpack_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [nvvm_tcgen05_st_32x32b_unpack_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_32x32b_unpack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [nvvm_tcgen05_st_32x32b_unpack_param_1];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [%r1], {%r2};
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_unpack_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_unpack_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [%r1], {%r3, %r4};
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_unpack_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_unpack_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [%r1], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT:    ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_unpack_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_unpack_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_unpack_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_unpack_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12};
-; CHECK-NEXT:    ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_unpack_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_unpack_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_unpack_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_unpack_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_unpack_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_unpack_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_unpack_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_unpack_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20};
-; CHECK-NEXT:    ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_unpack_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_unpack_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_unpack_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_unpack_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_unpack_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_32x32b_unpack_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_unpack_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_unpack_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_unpack_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_unpack_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_unpack_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_unpack_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_unpack_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_32x32b_unpack_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_unpack_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_unpack_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36};
-; CHECK-NEXT:    ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_unpack_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_unpack_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_unpack_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_unpack_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_unpack_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_unpack_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_unpack_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_unpack_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_unpack_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_unpack_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_unpack_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_unpack_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_32x32b_unpack_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_unpack_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_unpack_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_unpack_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_unpack_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_unpack_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_unpack_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_unpack_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_unpack_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_unpack_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_unpack_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_unpack_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_unpack_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_unpack_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_unpack_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_unpack_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_32x32b_unpack_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_unpack_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_unpack_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_unpack_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68};
-; CHECK-NEXT:    ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_unpack_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_unpack_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_unpack_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_unpack_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_unpack_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_unpack_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_unpack_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_unpack_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_unpack_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_unpack_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_unpack_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_unpack_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_unpack_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_unpack_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_unpack_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_unpack_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_unpack_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_unpack_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_unpack_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_unpack_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_32x32b_unpack_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_unpack_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_unpack_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_unpack_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_unpack_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_unpack_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_unpack_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_unpack_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_unpack_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_unpack_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_unpack_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_unpack_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_unpack_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_unpack_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_unpack_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_unpack_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_unpack_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_unpack_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_unpack_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_unpack_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_unpack_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_unpack_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_unpack_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_unpack_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_unpack_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_unpack_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_unpack_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_unpack_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_unpack_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_unpack_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_unpack_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_unpack_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_32x32b_unpack_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_unpack_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_unpack_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_unpack_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_unpack_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_unpack_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_unpack_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_unpack_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_unpack_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_unpack_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_unpack_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_unpack_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 1)
@@ -787,79 +787,79 @@ define void @nvvm_tcgen05_st_16x32bx2(ptr addrspace(6) %taddr, i32 %stv1, <2 x i
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x32bx2_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [nvvm_tcgen05_st_16x32bx2_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x32bx2_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [nvvm_tcgen05_st_16x32bx2_param_1];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x1.b32 [%r1], 2, {%r2};
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x2.b32 [%r1], 2, {%r3, %r4};
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x4.b32 [%r1], 2, {%r5, %r6, %r7, %r8};
-; CHECK-NEXT:    ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x8.b32 [%r1], 2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12};
-; CHECK-NEXT:    ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x32bx2_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x32bx2_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x16.b32 [%r1], 2, {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20};
-; CHECK-NEXT:    ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x32bx2_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x32bx2_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x32.b32 [%r1], 2, {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36};
-; CHECK-NEXT:    ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x32bx2_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x32bx2_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x64.b32 [%r1], 2, {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68};
-; CHECK-NEXT:    ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x32bx2_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x32bx2_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x32bx2_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x32bx2_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x32bx2_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x32bx2_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x32bx2_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x32bx2_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x128.b32 [%r1], 2, {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) %taddr, i64 2, i32 %stv1, i1 0)
@@ -887,79 +887,79 @@ define void @nvvm_tcgen05_st_16x32bx2_unpack(ptr addrspace(6) %taddr, i32 %stv1,
 ; CHECK-NEXT:    .reg .b32 %r<257>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [nvvm_tcgen05_st_16x32bx2_unpack_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [nvvm_tcgen05_st_16x32bx2_unpack_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [nvvm_tcgen05_st_16x32bx2_unpack_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [nvvm_tcgen05_st_16x32bx2_unpack_param_1];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [%r1], 2, {%r2};
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_unpack_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_unpack_param_2];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [%r1], 2, {%r3, %r4};
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_unpack_param_3];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_unpack_param_3];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [%r1], 2, {%r5, %r6, %r7, %r8};
-; CHECK-NEXT:    ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [%r1], 2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12};
-; CHECK-NEXT:    ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5];
+; CHECK-NEXT:    ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [%r1], 2, {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20};
-; CHECK-NEXT:    ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6];
+; CHECK-NEXT:    ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [%r1], 2, {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36};
-; CHECK-NEXT:    ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7];
+; CHECK-NEXT:    ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [%r1], 2, {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68};
-; CHECK-NEXT:    ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+496];
-; CHECK-NEXT:    ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+480];
-; CHECK-NEXT:    ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+464];
-; CHECK-NEXT:    ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+448];
-; CHECK-NEXT:    ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+432];
-; CHECK-NEXT:    ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+416];
-; CHECK-NEXT:    ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+400];
-; CHECK-NEXT:    ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+384];
-; CHECK-NEXT:    ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+368];
-; CHECK-NEXT:    ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+352];
-; CHECK-NEXT:    ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+336];
-; CHECK-NEXT:    ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+320];
-; CHECK-NEXT:    ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+304];
-; CHECK-NEXT:    ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+288];
-; CHECK-NEXT:    ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+272];
-; CHECK-NEXT:    ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+256];
-; CHECK-NEXT:    ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+240];
-; CHECK-NEXT:    ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+224];
-; CHECK-NEXT:    ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+208];
-; CHECK-NEXT:    ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+192];
-; CHECK-NEXT:    ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+176];
-; CHECK-NEXT:    ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+160];
-; CHECK-NEXT:    ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+144];
-; CHECK-NEXT:    ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+128];
-; CHECK-NEXT:    ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+112];
-; CHECK-NEXT:    ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+96];
-; CHECK-NEXT:    ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+80];
-; CHECK-NEXT:    ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+64];
-; CHECK-NEXT:    ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+48];
-; CHECK-NEXT:    ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+32];
-; CHECK-NEXT:    ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+16];
-; CHECK-NEXT:    ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8];
+; CHECK-NEXT:    ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+496];
+; CHECK-NEXT:    ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+480];
+; CHECK-NEXT:    ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+464];
+; CHECK-NEXT:    ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+448];
+; CHECK-NEXT:    ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+432];
+; CHECK-NEXT:    ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+416];
+; CHECK-NEXT:    ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+400];
+; CHECK-NEXT:    ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+384];
+; CHECK-NEXT:    ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+368];
+; CHECK-NEXT:    ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+352];
+; CHECK-NEXT:    ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+336];
+; CHECK-NEXT:    ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+320];
+; CHECK-NEXT:    ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+304];
+; CHECK-NEXT:    ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+288];
+; CHECK-NEXT:    ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+272];
+; CHECK-NEXT:    ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+256];
+; CHECK-NEXT:    ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+240];
+; CHECK-NEXT:    ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+224];
+; CHECK-NEXT:    ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+208];
+; CHECK-NEXT:    ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+192];
+; CHECK-NEXT:    ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+176];
+; CHECK-NEXT:    ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+160];
+; CHECK-NEXT:    ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+144];
+; CHECK-NEXT:    ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+128];
+; CHECK-NEXT:    ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+112];
+; CHECK-NEXT:    ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+96];
+; CHECK-NEXT:    ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+80];
+; CHECK-NEXT:    ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+64];
+; CHECK-NEXT:    ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+48];
+; CHECK-NEXT:    ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+32];
+; CHECK-NEXT:    ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+16];
+; CHECK-NEXT:    ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8];
 ; CHECK-NEXT:    tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [%r1], 2, {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132};
 ; CHECK-NEXT:    ret;
   tail call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) %taddr, i64 2, i32 %stv1, i1 1)
diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
index 4e4e3f3aaec62..3d6489a2340da 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -18,12 +18,12 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [foo_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [foo_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [foo_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [foo_param_1];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
-; CHECK-NEXT:    ld.param.u32 %r1, [foo_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [foo_param_2];
 ; CHECK-NEXT:    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [%rd1, {%r1}];
-; CHECK-NEXT:    st.global.f32 [%rd3], %f1;
+; CHECK-NEXT:    st.global.b32 [%rd3], %f1;
 ; CHECK-NEXT:    ret;
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
@@ -42,11 +42,11 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [bar_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [bar_param_0];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; CHECK-NEXT:    ld.param.u32 %r1, [bar_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [bar_param_1];
 ; CHECK-NEXT:    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [tex0, {%r1}];
-; CHECK-NEXT:    st.global.f32 [%rd2], %f1;
+; CHECK-NEXT:    st.global.b32 [%rd2], %f1;
 ; CHECK-NEXT:    ret;
   %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
@@ -65,9 +65,9 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [baz_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [baz_param_0];
 ; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; CHECK-NEXT:    ld.param.u32 %r1, [baz_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [baz_param_1];
 ; CHECK-NEXT:    mov.u64 %rd3, tex0;
 ; CHECK-NEXT:    tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [tex0, {%r1}];
 ; CHECK-NEXT:    { // callseq 0, 0
@@ -79,10 +79,10 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) {
 ; CHECK-NEXT:    (
 ; CHECK-NEXT:    param0
 ; CHECK-NEXT:    );
-; CHECK-NEXT:    ld.param.f32 %f5, [retval0];
+; CHECK-NEXT:    ld.param.b32 %f5, [retval0];
 ; CHECK-NEXT:    } // callseq 0
 ; CHECK-NEXT:    add.rn.f32 %f7, %f1, %f5;
-; CHECK-NEXT:    st.global.f32 [%rd2], %f7;
+; CHECK-NEXT:    st.global.b32 [%rd2], %f7;
 ; CHECK-NEXT:    ret;
   %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0)
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
diff --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll
index d74c89f5abc8d..22116b2fafc39 100644
--- a/llvm/test/CodeGen/NVPTX/tex-read.ll
+++ b/llvm/test/CodeGen/NVPTX/tex-read.ll
@@ -10,7 +10,7 @@ define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) {
 ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
   %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
-; CHECK: st.f32 [%rd{{[0-9]+}}], %f[[RED]]
+; CHECK: st.b32 [%rd{{[0-9]+}}], %f[[RED]]
   store float %ret, ptr %red
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
index c9f9ccca82c6f..4edbec48e6bec 100644
--- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
+++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -23,7 +23,7 @@ define i32 @t0(i64 %texHandle) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [t0_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [t0_param_0];
 ; CHECK-NEXT:    txq.width.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -54,7 +54,7 @@ define i32 @t2(i64 %texHandle) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [t2_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [t2_param_0];
 ; CHECK-NEXT:    txq.height.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -85,7 +85,7 @@ define i32 @s0(i64 %surfHandle) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [s0_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [s0_param_0];
 ; CHECK-NEXT:    suq.width.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
@@ -116,7 +116,7 @@ define i32 @s2(i64 %surfHandle) {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [s2_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [s2_param_0];
 ; CHECK-NEXT:    suq.height.b32 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index ff74df124b41f..82ebb0ca57377 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -22,9 +22,9 @@
 ; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[16])
 ; CHECK-LABEL: test_s_i8i16p(
 ; CHECK:        .param .align 8 .b8 test_s_i8i16p_param_0[16]
-; CHECK-DAG:    ld.param.u16 [[P0:%rs[0-9]+]],   [test_s_i8i16p_param_0];
-; CHECK-DAG:    ld.param.u8 [[P2_0:%rs[0-9]+]],   [test_s_i8i16p_param_0+3];
-; CHECK-DAG:    ld.param.u8 [[P2_1:%rs[0-9]+]],   [test_s_i8i16p_param_0+4];
+; CHECK-DAG:    ld.param.b16 [[P0:%rs[0-9]+]],   [test_s_i8i16p_param_0];
+; CHECK-DAG:    ld.param.b8 [[P2_0:%rs[0-9]+]],   [test_s_i8i16p_param_0+3];
+; CHECK-DAG:    ld.param.b8 [[P2_1:%rs[0-9]+]],   [test_s_i8i16p_param_0+4];
 ; CHECK-DAG:    shl.b16     [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
 ; CHECK-DAG:    or.b16      [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
 ; CHECK:        { // callseq
@@ -59,11 +59,11 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
 ; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
 ; CHECK-LABEL: test_s_i8i32p(
 ; CHECK:        .param .align 8 .b8 test_s_i8i32p_param_0[24]
-; CHECK-DAG:    ld.param.u32 [[P0:%r[0-9]+]],   [test_s_i8i32p_param_0];
-; CHECK-DAG:    ld.param.u8 [[P2_0:%r[0-9]+]],   [test_s_i8i32p_param_0+5];
-; CHECK-DAG:    ld.param.u8 [[P2_1:%r[0-9]+]],   [test_s_i8i32p_param_0+6];
-; CHECK-DAG:    ld.param.u8 [[P2_2:%r[0-9]+]],   [test_s_i8i32p_param_0+7];
-; CHECK-DAG:    ld.param.u8 [[P2_3:%r[0-9]+]],   [test_s_i8i32p_param_0+8];
+; CHECK-DAG:    ld.param.b32 [[P0:%r[0-9]+]],   [test_s_i8i32p_param_0];
+; CHECK-DAG:    ld.param.b8 [[P2_0:%r[0-9]+]],   [test_s_i8i32p_param_0+5];
+; CHECK-DAG:    ld.param.b8 [[P2_1:%r[0-9]+]],   [test_s_i8i32p_param_0+6];
+; CHECK-DAG:    ld.param.b8 [[P2_2:%r[0-9]+]],   [test_s_i8i32p_param_0+7];
+; CHECK-DAG:    ld.param.b8 [[P2_3:%r[0-9]+]],   [test_s_i8i32p_param_0+8];
 ; CHECK-DAG:    shl.b32     [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
 ; CHECK-DAG:    shl.b32     [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
 ; CHECK-DAG:    shl.b32     [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
@@ -106,15 +106,15 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[32])
 ; CHECK-LABEL: test_s_i8i64p(
 ; CHECK:        .param .align 8 .b8 test_s_i8i64p_param_0[32]
-; CHECK-DAG:    ld.param.u64 [[P0:%rd[0-9]+]],   [test_s_i8i64p_param_0];
-; CHECK-DAG:    ld.param.u8 [[P2_0:%rd[0-9]+]],   [test_s_i8i64p_param_0+9];
-; CHECK-DAG:    ld.param.u8 [[P2_1:%rd[0-9]+]],   [test_s_i8i64p_param_0+10];
-; CHECK-DAG:    ld.param.u8 [[P2_2:%rd[0-9]+]],   [test_s_i8i64p_param_0+11];
-; CHECK-DAG:    ld.param.u8 [[P2_3:%rd[0-9]+]],   [test_s_i8i64p_param_0+12];
-; CHECK-DAG:    ld.param.u8 [[P2_4:%rd[0-9]+]],   [test_s_i8i64p_param_0+13];
-; CHECK-DAG:    ld.param.u8 [[P2_5:%rd[0-9]+]],   [test_s_i8i64p_param_0+14];
-; CHECK-DAG:    ld.param.u8 [[P2_6:%rd[0-9]+]],   [test_s_i8i64p_param_0+15];
-; CHECK-DAG:    ld.param.u8 [[P2_7:%rd[0-9]+]],   [test_s_i8i64p_param_0+16];
+; CHECK-DAG:    ld.param.b64 [[P0:%rd[0-9]+]],   [test_s_i8i64p_param_0];
+; CHECK-DAG:    ld.param.b8 [[P2_0:%rd[0-9]+]],   [test_s_i8i64p_param_0+9];
+; CHECK-DAG:    ld.param.b8 [[P2_1:%rd[0-9]+]],   [test_s_i8i64p_param_0+10];
+; CHECK-DAG:    ld.param.b8 [[P2_2:%rd[0-9]+]],   [test_s_i8i64p_param_0+11];
+; CHECK-DAG:    ld.param.b8 [[P2_3:%rd[0-9]+]],   [test_s_i8i64p_param_0+12];
+; CHECK-DAG:    ld.param.b8 [[P2_4:%rd[0-9]+]],   [test_s_i8i64p_param_0+13];
+; CHECK-DAG:    ld.param.b8 [[P2_5:%rd[0-9]+]],   [test_s_i8i64p_param_0+14];
+; CHECK-DAG:    ld.param.b8 [[P2_6:%rd[0-9]+]],   [test_s_i8i64p_param_0+15];
+; CHECK-DAG:    ld.param.b8 [[P2_7:%rd[0-9]+]],   [test_s_i8i64p_param_0+16];
 ; CHECK-DAG:    shl.b64      [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
 ; CHECK-DAG:    shl.b64      [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
 ; CHECK-DAG:    shl.b64      [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
@@ -182,8 +182,8 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-LABEL: test_s_i8f16p(
 ; CHECK:        .param .align 8 .b8 test_s_i8f16p_param_0[16]
 ; CHECK-DAG:    ld.param.b16 [[P0:%rs[0-9]+]],     [test_s_i8f16p_param_0];
-; CHECK-DAG:    ld.param.u8  [[P2_0:%rs[0-9]+]],   [test_s_i8f16p_param_0+3];
-; CHECK-DAG:    ld.param.u8  [[P2_1:%rs[0-9]+]],   [test_s_i8f16p_param_0+4];
+; CHECK-DAG:    ld.param.b8  [[P2_0:%rs[0-9]+]],   [test_s_i8f16p_param_0+3];
+; CHECK-DAG:    ld.param.b8  [[P2_1:%rs[0-9]+]],   [test_s_i8f16p_param_0+4];
 ; CHECK-DAG:    shl.b16      [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
 ; CHECK-DAG:    or.b16       [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
 ; CHECK:        { // callseq
@@ -219,10 +219,10 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
 ; CHECK-LABEL: test_s_i8f16x2p(
 ; CHECK:        .param .align 8 .b8 test_s_i8f16x2p_param_0[24]
 ; CHECK-DAG:    ld.param.b32 [[P0:%r[0-9]+]],  [test_s_i8f16x2p_param_0];
-; CHECK-DAG:    ld.param.u8  [[P2_0:%r[0-9]+]],   [test_s_i8f16x2p_param_0+5];
-; CHECK-DAG:    ld.param.u8  [[P2_1:%r[0-9]+]],   [test_s_i8f16x2p_param_0+6];
-; CHECK-DAG:    ld.param.u8  [[P2_2:%r[0-9]+]],   [test_s_i8f16x2p_param_0+7];
-; CHECK-DAG:    ld.param.u8  [[P2_3:%r[0-9]+]],   [test_s_i8f16x2p_param_0+8];
+; CHECK-DAG:    ld.param.b8  [[P2_0:%r[0-9]+]],   [test_s_i8f16x2p_param_0+5];
+; CHECK-DAG:    ld.param.b8  [[P2_1:%r[0-9]+]],   [test_s_i8f16x2p_param_0+6];
+; CHECK-DAG:    ld.param.b8  [[P2_2:%r[0-9]+]],   [test_s_i8f16x2p_param_0+7];
+; CHECK-DAG:    ld.param.b8  [[P2_3:%r[0-9]+]],   [test_s_i8f16x2p_param_0+8];
 ; CHECK-DAG:    shl.b32      [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
 ; CHECK-DAG:    shl.b32      [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
 ; CHECK-DAG:    shl.b32      [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
@@ -265,11 +265,11 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[24])
 ; CHECK-LABEL: test_s_i8f32p(
 ; CHECK:        .param .align 8 .b8 test_s_i8f32p_param_0[24]
-; CHECK-DAG:    ld.param.f32 [[P0:%f[0-9]+]],    [test_s_i8f32p_param_0];
-; CHECK-DAG:    ld.param.u8  [[P2_0:%r[0-9]+]],   [test_s_i8f32p_param_0+5];
-; CHECK-DAG:    ld.param.u8  [[P2_1:%r[0-9]+]],   [test_s_i8f32p_param_0+6];
-; CHECK-DAG:    ld.param.u8  [[P2_2:%r[0-9]+]],   [test_s_i8f32p_param_0+7];
-; CHECK-DAG:    ld.param.u8  [[P2_3:%r[0-9]+]],   [test_s_i8f32p_param_0+8];
+; CHECK-DAG:    ld.param.b32 [[P0:%f[0-9]+]],    [test_s_i8f32p_param_0];
+; CHECK-DAG:    ld.param.b8  [[P2_0:%r[0-9]+]],   [test_s_i8f32p_param_0+5];
+; CHECK-DAG:    ld.param.b8  [[P2_1:%r[0-9]+]],   [test_s_i8f32p_param_0+6];
+; CHECK-DAG:    ld.param.b8  [[P2_2:%r[0-9]+]],   [test_s_i8f32p_param_0+7];
+; CHECK-DAG:    ld.param.b8  [[P2_3:%r[0-9]+]],   [test_s_i8f32p_param_0+8];
 ; CHECK-DAG:    shl.b32      [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
 ; CHECK-DAG:    shl.b32      [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
 ; CHECK-DAG:    shl.b32      [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
@@ -280,7 +280,7 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-DAG:    shr.u32      [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
 ; CHECK:        { // callseq
 ; CHECK-DAG:    .param .align 8 .b8 param0[24];
-; CHECK-DAG:    st.param.f32 [param0], [[P0]];
+; CHECK-DAG:    st.param.b32 [param0], [[P0]];
 ; CHECK-DAG:    st.param.b8  [param0+5], [[P2]];
 ; CHECK-DAG:    st.param.b8  [param0+6], [[P2_1_shr]];
 ; CHECK-DAG:    st.param.b8  [param0+7], [[P2_2_shr]];
@@ -291,13 +291,13 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-NEXT:   (
 ; CHECK-NEXT:   param0
 ; CHECK-NEXT:   );
-; CHECK-DAG:    ld.param.f32 [[R0:%f[0-9]+]],    [retval0];
+; CHECK-DAG:    ld.param.b32 [[R0:%f[0-9]+]],    [retval0];
 ; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+5];
 ; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+6];
 ; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+7];
 ; CHECK-DAG:    ld.param.b8  [[R2_3:%rs[0-9]+]], [retval0+8];
 ; CHECK:        } // callseq
-; CHECK-DAG:    st.param.f32 [func_retval0], [[R0]];
+; CHECK-DAG:    st.param.b32 [func_retval0], [[R0]];
 ; CHECK-DAG:    st.param.b8  [func_retval0+5],
 ; CHECK-DAG:    st.param.b8  [func_retval0+6],
 ; CHECK-DAG:    st.param.b8  [func_retval0+7],
@@ -312,15 +312,15 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK:       .visible .func (.param .align 8 .b8 func_retval0[32])
 ; CHECK-LABEL: test_s_i8f64p(
 ; CHECK:        .param .align 8 .b8 test_s_i8f64p_param_0[32]
-; CHECK-DAG:    ld.param.f64 [[P0:%fd[0-9]+]],    [test_s_i8f64p_param_0];
-; CHECK-DAG:    ld.param.u8  [[P2_0:%rd[0-9]+]],   [test_s_i8f64p_param_0+9];
-; CHECK-DAG:    ld.param.u8  [[P2_1:%rd[0-9]+]],   [test_s_i8f64p_param_0+10];
-; CHECK-DAG:    ld.param.u8  [[P2_2:%rd[0-9]+]],   [test_s_i8f64p_param_0+11];
-; CHECK-DAG:    ld.param.u8  [[P2_3:%rd[0-9]+]],   [test_s_i8f64p_param_0+12];
-; CHECK-DAG:    ld.param.u8  [[P2_4:%rd[0-9]+]],   [test_s_i8f64p_param_0+13];
-; CHECK-DAG:    ld.param.u8  [[P2_5:%rd[0-9]+]],   [test_s_i8f64p_param_0+14];
-; CHECK-DAG:    ld.param.u8  [[P2_6:%rd[0-9]+]],   [test_s_i8f64p_param_0+15];
-; CHECK-DAG:    ld.param.u8  [[P2_7:%rd[0-9]+]],   [test_s_i8f64p_param_0+16];
+; CHECK-DAG:    ld.param.b64 [[P0:%fd[0-9]+]],    [test_s_i8f64p_param_0];
+; CHECK-DAG:    ld.param.b8  [[P2_0:%rd[0-9]+]],   [test_s_i8f64p_param_0+9];
+; CHECK-DAG:    ld.param.b8  [[P2_1:%rd[0-9]+]],   [test_s_i8f64p_param_0+10];
+; CHECK-DAG:    ld.param.b8  [[P2_2:%rd[0-9]+]],   [test_s_i8f64p_param_0+11];
+; CHECK-DAG:    ld.param.b8  [[P2_3:%rd[0-9]+]],   [test_s_i8f64p_param_0+12];
+; CHECK-DAG:    ld.param.b8  [[P2_4:%rd[0-9]+]],   [test_s_i8f64p_param_0+13];
+; CHECK-DAG:    ld.param.b8  [[P2_5:%rd[0-9]+]],   [test_s_i8f64p_param_0+14];
+; CHECK-DAG:    ld.param.b8  [[P2_6:%rd[0-9]+]],   [test_s_i8f64p_param_0+15];
+; CHECK-DAG:    ld.param.b8  [[P2_7:%rd[0-9]+]],   [test_s_i8f64p_param_0+16];
 ; CHECK-DAG:    shl.b64      [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
 ; CHECK-DAG:    shl.b64      [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
 ; CHECK-DAG:    shl.b64      [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
@@ -343,7 +343,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-DAG:    bfe.u64      [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
 ; CHECK:        { // callseq
 ; CHECK:        .param .align 8 .b8 param0[32];
-; CHECK-DAG:    st.param.f64 [param0],  [[P0]];
+; CHECK-DAG:    st.param.b64 [param0],  [[P0]];
 ; CHECK-DAG:    st.param.b8  [param0+9],  [[P2]];
 ; CHECK-DAG:    st.param.b8  [param0+10], [[P2_shr_1]];
 ; CHECK-DAG:    st.param.b8  [param0+11], [[P2_shr_2]];
@@ -358,7 +358,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-NEXT:   (
 ; CHECK-NEXT:   param0
 ; CHECK-NEXT:   );
-; CHECK-DAG:    ld.param.f64 [[R0:%fd[0-9]+]],   [retval0];
+; CHECK-DAG:    ld.param.b64 [[R0:%fd[0-9]+]],   [retval0];
 ; CHECK-DAG:    ld.param.b8  [[R2_0:%rs[0-9]+]], [retval0+9];
 ; CHECK-DAG:    ld.param.b8  [[R2_1:%rs[0-9]+]], [retval0+10];
 ; CHECK-DAG:    ld.param.b8  [[R2_2:%rs[0-9]+]], [retval0+11];
@@ -368,7 +368,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-DAG:    ld.param.b8  [[R2_6:%rs[0-9]+]], [retval0+15];
 ; CHECK-DAG:    ld.param.b8  [[R2_7:%rs[0-9]+]], [retval0+16];
 ; CHECK:        } // callseq
-; CHECK-DAG:    st.param.f64 [func_retval0], [[R0]];
+; CHECK-DAG:    st.param.b64 [func_retval0], [[R0]];
 ; CHECK-DAG:    st.param.b8  [func_retval0+9],
 ; CHECK-DAG:    st.param.b8  [func_retval0+10],
 ; CHECK-DAG:    st.param.b8  [func_retval0+11],
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 303c649b794fd..8e4c77e76029c 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -11,10 +11,10 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [out_v1i8_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [out_v1i8_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs1, [out_v1i8_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_2];
 ; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.u8 %rs4, [out_v1i8_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs4, [out_v1i8_param_1];
 ; CHECK-NEXT:    not.b16 %rs5, %rs2;
 ; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
 ; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
@@ -37,10 +37,10 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin
 ; CHECK-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [out_v1i16_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [out_v1i16_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs1, [out_v1i16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_2];
 ; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.u16 %rs4, [out_v1i16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs4, [out_v1i16_param_1];
 ; CHECK-NEXT:    not.b16 %rs5, %rs2;
 ; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
 ; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
@@ -63,9 +63,9 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [out_v4i8_param_1];
-; CHECK-NEXT:    ld.param.u32 %r2, [out_v4i8_param_0];
-; CHECK-NEXT:    ld.param.u32 %r3, [out_v4i8_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [out_v4i8_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v4i8_param_0];
+; CHECK-NEXT:    ld.param.b32 %r3, [out_v4i8_param_2];
 ; CHECK-NEXT:    and.b32 %r4, %r2, %r3;
 ; CHECK-NEXT:    xor.b32 %r5, %r3, -1;
 ; CHECK-NEXT:    and.b32 %r6, %r1, %r5;
@@ -85,9 +85,9 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [out_v4i8_undef_param_1];
-; CHECK-NEXT:    ld.param.u32 %r2, [out_v4i8_undef_param_0];
-; CHECK-NEXT:    ld.param.u32 %r3, [out_v4i8_undef_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [out_v4i8_undef_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v4i8_undef_param_0];
+; CHECK-NEXT:    ld.param.b32 %r3, [out_v4i8_undef_param_2];
 ; CHECK-NEXT:    and.b32 %r4, %r2, %r3;
 ; CHECK-NEXT:    xor.b32 %r5, %r3, -16711681;
 ; CHECK-NEXT:    and.b32 %r6, %r1, %r5;
@@ -107,9 +107,9 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [out_v2i16_param_1];
-; CHECK-NEXT:    ld.param.u32 %r2, [out_v2i16_param_0];
-; CHECK-NEXT:    ld.param.u32 %r3, [out_v2i16_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [out_v2i16_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v2i16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r3, [out_v2i16_param_2];
 ; CHECK-NEXT:    and.b32 %r4, %r2, %r3;
 ; CHECK-NEXT:    xor.b32 %r5, %r3, -1;
 ; CHECK-NEXT:    and.b32 %r6, %r1, %r5;
@@ -129,10 +129,10 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin
 ; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [out_v1i32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [out_v1i32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r1, [out_v1i32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_2];
 ; CHECK-NEXT:    and.b32 %r3, %r1, %r2;
-; CHECK-NEXT:    ld.param.u32 %r4, [out_v1i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r4, [out_v1i32_param_1];
 ; CHECK-NEXT:    not.b32 %r5, %r2;
 ; CHECK-NEXT:    and.b32 %r6, %r4, %r5;
 ; CHECK-NEXT:    or.b32 %r7, %r3, %r6;
@@ -155,11 +155,11 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .reg .b32 %r<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [out_v8i8_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v8i8_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v8i8_param_2];
 ; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
 ; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
-; CHECK-NEXT:    ld.param.v2.u32 {%r7, %r8}, [out_v8i8_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v8i8_param_1];
 ; CHECK-NEXT:    xor.b32 %r9, %r4, -1;
 ; CHECK-NEXT:    xor.b32 %r10, %r3, -1;
 ; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
@@ -181,11 +181,11 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-NEXT:    .reg .b32 %r<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [out_v4i16_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [out_v4i16_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v4i16_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v4i16_param_2];
 ; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
 ; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
-; CHECK-NEXT:    ld.param.v2.u32 {%r7, %r8}, [out_v4i16_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v4i16_param_1];
 ; CHECK-NEXT:    xor.b32 %r9, %r4, -1;
 ; CHECK-NEXT:    xor.b32 %r10, %r3, -1;
 ; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
@@ -207,11 +207,11 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-NEXT:    .reg .b32 %r<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [out_v4i16_undef_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v4i16_undef_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v4i16_undef_param_2];
 ; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
 ; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
-; CHECK-NEXT:    ld.param.v2.u32 {%r7, %r8}, [out_v4i16_undef_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v4i16_undef_param_1];
 ; CHECK-NEXT:    xor.b32 %r9, %r4, -65536;
 ; CHECK-NEXT:    xor.b32 %r10, %r3, -1;
 ; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
@@ -233,11 +233,11 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
 ; CHECK-NEXT:    .reg .b32 %r<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [out_v2i32_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [out_v2i32_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2];
 ; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
 ; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
-; CHECK-NEXT:    ld.param.v2.u32 {%r7, %r8}, [out_v2i32_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1];
 ; CHECK-NEXT:    not.b32 %r9, %r4;
 ; CHECK-NEXT:    not.b32 %r10, %r3;
 ; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
@@ -259,10 +259,10 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin
 ; CHECK-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [out_v1i64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [out_v1i64_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd1, [out_v1i64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_2];
 ; CHECK-NEXT:    and.b64 %rd3, %rd1, %rd2;
-; CHECK-NEXT:    ld.param.u64 %rd4, [out_v1i64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd4, [out_v1i64_param_1];
 ; CHECK-NEXT:    not.b64 %rd5, %rd2;
 ; CHECK-NEXT:    and.b64 %rd6, %rd4, %rd5;
 ; CHECK-NEXT:    or.b64 %rd7, %rd3, %rd6;
@@ -285,13 +285,13 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
 ; CHECK-NEXT:    .reg .b32 %r<29>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v16i8_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v16i8_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
 ; CHECK-NEXT:    and.b32 %r10, %r2, %r6;
 ; CHECK-NEXT:    and.b32 %r11, %r3, %r7;
 ; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v16i8_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v16i8_param_1];
 ; CHECK-NEXT:    xor.b32 %r17, %r8, -1;
 ; CHECK-NEXT:    xor.b32 %r18, %r7, -1;
 ; CHECK-NEXT:    xor.b32 %r19, %r6, -1;
@@ -319,13 +319,13 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-NEXT:    .reg .b32 %r<29>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v8i16_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v8i16_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v8i16_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v8i16_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
 ; CHECK-NEXT:    and.b32 %r10, %r2, %r6;
 ; CHECK-NEXT:    and.b32 %r11, %r3, %r7;
 ; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v8i16_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v8i16_param_1];
 ; CHECK-NEXT:    xor.b32 %r17, %r8, -1;
 ; CHECK-NEXT:    xor.b32 %r18, %r7, -1;
 ; CHECK-NEXT:    xor.b32 %r19, %r6, -1;
@@ -353,13 +353,13 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin
 ; CHECK-NEXT:    .reg .b32 %r<29>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
 ; CHECK-NEXT:    and.b32 %r10, %r2, %r6;
 ; CHECK-NEXT:    and.b32 %r11, %r3, %r7;
 ; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1];
 ; CHECK-NEXT:    not.b32 %r17, %r8;
 ; CHECK-NEXT:    not.b32 %r18, %r7;
 ; CHECK-NEXT:    not.b32 %r19, %r6;
@@ -387,13 +387,13 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n
 ; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r3, %r7;
 ; CHECK-NEXT:    and.b32 %r10, %r1, %r5;
 ; CHECK-NEXT:    and.b32 %r11, %r2, %r6;
 ; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1];
 ; CHECK-NEXT:    not.b32 %r17, %r8;
 ; CHECK-NEXT:    not.b32 %r18, %r6;
 ; CHECK-NEXT:    not.b32 %r19, %r5;
@@ -418,11 +418,11 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwin
 ; CHECK-NEXT:    .reg .b64 %rd<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [out_v2i64_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [out_v2i64_param_2];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2];
 ; CHECK-NEXT:    and.b64 %rd5, %rd1, %rd3;
 ; CHECK-NEXT:    and.b64 %rd6, %rd2, %rd4;
-; CHECK-NEXT:    ld.param.v2.u64 {%rd7, %rd8}, [out_v2i64_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1];
 ; CHECK-NEXT:    not.b64 %rd9, %rd4;
 ; CHECK-NEXT:    not.b64 %rd10, %rd3;
 ; CHECK-NEXT:    and.b64 %rd11, %rd7, %rd10;
@@ -452,10 +452,10 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [in_v1i8_param_0];
-; CHECK-NEXT:    ld.param.u8 %rs2, [in_v1i8_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [in_v1i8_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs2, [in_v1i8_param_1];
 ; CHECK-NEXT:    xor.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.u8 %rs4, [in_v1i8_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs4, [in_v1i8_param_2];
 ; CHECK-NEXT:    and.b16 %rs5, %rs3, %rs4;
 ; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs6;
@@ -476,10 +476,10 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u16 %rs1, [in_v1i16_param_0];
-; CHECK-NEXT:    ld.param.u16 %rs2, [in_v1i16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs1, [in_v1i16_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs2, [in_v1i16_param_1];
 ; CHECK-NEXT:    xor.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.u16 %rs4, [in_v1i16_param_2];
+; CHECK-NEXT:    ld.param.b16 %rs4, [in_v1i16_param_2];
 ; CHECK-NEXT:    and.b16 %rs5, %rs3, %rs4;
 ; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs6;
@@ -500,10 +500,10 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [in_v4i8_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [in_v4i8_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [in_v4i8_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [in_v4i8_param_1];
 ; CHECK-NEXT:    xor.b32 %r3, %r1, %r2;
-; CHECK-NEXT:    ld.param.u32 %r4, [in_v4i8_param_2];
+; CHECK-NEXT:    ld.param.b32 %r4, [in_v4i8_param_2];
 ; CHECK-NEXT:    and.b32 %r5, %r3, %r4;
 ; CHECK-NEXT:    xor.b32 %r6, %r5, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
@@ -520,10 +520,10 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [in_v2i16_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [in_v2i16_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [in_v2i16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [in_v2i16_param_1];
 ; CHECK-NEXT:    xor.b32 %r3, %r1, %r2;
-; CHECK-NEXT:    ld.param.u32 %r4, [in_v2i16_param_2];
+; CHECK-NEXT:    ld.param.b32 %r4, [in_v2i16_param_2];
 ; CHECK-NEXT:    and.b32 %r5, %r3, %r4;
 ; CHECK-NEXT:    xor.b32 %r6, %r5, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
@@ -540,10 +540,10 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u32 %r1, [in_v1i32_param_0];
-; CHECK-NEXT:    ld.param.u32 %r2, [in_v1i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [in_v1i32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [in_v1i32_param_1];
 ; CHECK-NEXT:    xor.b32 %r3, %r1, %r2;
-; CHECK-NEXT:    ld.param.u32 %r4, [in_v1i32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r4, [in_v1i32_param_2];
 ; CHECK-NEXT:    and.b32 %r5, %r3, %r4;
 ; CHECK-NEXT:    xor.b32 %r6, %r5, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
@@ -564,9 +564,9 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [in_v8i8_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [in_v8i8_param_1];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [in_v8i8_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [in_v8i8_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [in_v8i8_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [in_v8i8_param_2];
 ; CHECK-NEXT:    xor.b32 %r7, %r2, %r4;
 ; CHECK-NEXT:    and.b32 %r8, %r7, %r6;
 ; CHECK-NEXT:    xor.b32 %r9, %r8, %r4;
@@ -587,9 +587,9 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind
 ; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [in_v4i16_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [in_v4i16_param_1];
-; CHECK-NEXT:    ld.param.v2.u32 {%r5, %r6}, [in_v4i16_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [in_v4i16_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [in_v4i16_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [in_v4i16_param_2];
 ; CHECK-NEXT:    xor.b32 %r7, %r2, %r4;
 ; CHECK-NEXT:    and.b32 %r8, %r7, %r6;
 ; CHECK-NEXT:    xor.b32 %r9, %r8, %r4;
@@ -610,11 +610,11 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind
 ; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [in_v2i32_param_0];
-; CHECK-NEXT:    ld.param.v2.u32 {%r3, %r4}, [in_v2i32_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [in_v2i32_param_0];
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [in_v2i32_param_1];
 ; CHECK-NEXT:    xor.b32 %r5, %r2, %r4;
 ; CHECK-NEXT:    xor.b32 %r6, %r1, %r3;
-; CHECK-NEXT:    ld.param.v2.u32 {%r7, %r8}, [in_v2i32_param_2];
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [in_v2i32_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r6, %r7;
 ; CHECK-NEXT:    and.b32 %r10, %r5, %r8;
 ; CHECK-NEXT:    xor.b32 %r11, %r10, %r4;
@@ -633,10 +633,10 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind
 ; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [in_v1i64_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [in_v1i64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [in_v1i64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [in_v1i64_param_1];
 ; CHECK-NEXT:    xor.b64 %rd3, %rd1, %rd2;
-; CHECK-NEXT:    ld.param.u64 %rd4, [in_v1i64_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd4, [in_v1i64_param_2];
 ; CHECK-NEXT:    and.b64 %rd5, %rd3, %rd4;
 ; CHECK-NEXT:    xor.b64 %rd6, %rd5, %rd2;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
@@ -657,13 +657,13 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-NEXT:    .reg .b32 %r<25>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v16i8_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [in_v16i8_param_1];
 ; CHECK-NEXT:    xor.b32 %r9, %r4, %r8;
 ; CHECK-NEXT:    xor.b32 %r10, %r3, %r7;
 ; CHECK-NEXT:    xor.b32 %r11, %r2, %r6;
 ; CHECK-NEXT:    xor.b32 %r12, %r1, %r5;
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v16i8_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [in_v16i8_param_2];
 ; CHECK-NEXT:    and.b32 %r17, %r12, %r13;
 ; CHECK-NEXT:    and.b32 %r18, %r11, %r14;
 ; CHECK-NEXT:    and.b32 %r19, %r10, %r15;
@@ -686,13 +686,13 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind
 ; CHECK-NEXT:    .reg .b32 %r<25>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v8i16_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v8i16_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [in_v8i16_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [in_v8i16_param_1];
 ; CHECK-NEXT:    xor.b32 %r9, %r4, %r8;
 ; CHECK-NEXT:    xor.b32 %r10, %r3, %r7;
 ; CHECK-NEXT:    xor.b32 %r11, %r2, %r6;
 ; CHECK-NEXT:    xor.b32 %r12, %r1, %r5;
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v8i16_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [in_v8i16_param_2];
 ; CHECK-NEXT:    and.b32 %r17, %r12, %r13;
 ; CHECK-NEXT:    and.b32 %r18, %r11, %r14;
 ; CHECK-NEXT:    and.b32 %r19, %r10, %r15;
@@ -715,13 +715,13 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind
 ; CHECK-NEXT:    .reg .b32 %r<25>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v4i32_param_0];
-; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v4i32_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [in_v4i32_param_0];
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [in_v4i32_param_1];
 ; CHECK-NEXT:    xor.b32 %r9, %r4, %r8;
 ; CHECK-NEXT:    xor.b32 %r10, %r3, %r7;
 ; CHECK-NEXT:    xor.b32 %r11, %r2, %r6;
 ; CHECK-NEXT:    xor.b32 %r12, %r1, %r5;
-; CHECK-NEXT:    ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v4i32_param_2];
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [in_v4i32_param_2];
 ; CHECK-NEXT:    and.b32 %r17, %r12, %r13;
 ; CHECK-NEXT:    and.b32 %r18, %r11, %r14;
 ; CHECK-NEXT:    and.b32 %r19, %r10, %r15;
@@ -744,11 +744,11 @@ define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind
 ; CHECK-NEXT:    .reg .b64 %rd<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [in_v2i64_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [in_v2i64_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [in_v2i64_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [in_v2i64_param_1];
 ; CHECK-NEXT:    xor.b64 %rd5, %rd2, %rd4;
 ; CHECK-NEXT:    xor.b64 %rd6, %rd1, %rd3;
-; CHECK-NEXT:    ld.param.v2.u64 {%rd7, %rd8}, [in_v2i64_param_2];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [in_v2i64_param_2];
 ; CHECK-NEXT:    and.b64 %rd9, %rd6, %rd7;
 ; CHECK-NEXT:    and.b64 %rd10, %rd5, %rd8;
 ; CHECK-NEXT:    xor.b64 %rd11, %rd10, %rd4;
diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index 465e2a6a60eb5..0cd0d29294c32 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -17,56 +17,56 @@ entry:
 ; Test va_start
 ; CHECK:         .param .align 8 .b8 foo_vararg[]
 ; CHECK:         mov.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], foo_vararg;
-; CHECK-NEXT:    st.u[[BITS]] [%SP], [[VA_PTR]];
+; CHECK-NEXT:    st.b[[BITS]] [%SP], [[VA_PTR]];
 
   call void @llvm.va_start(ptr %al)
 
 ; Test va_copy()
-; CHECK-NEXT:	 ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
-; CHECK-NEXT:	 st.u[[BITS]] [%SP+{{[0-9]+}}], [[VA_PTR]];
+; CHECK-NEXT:	 ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT:	 st.b[[BITS]] [%SP+{{[0-9]+}}], [[VA_PTR]];
 
   call void @llvm.va_copy(ptr %al2, ptr %al)
 
 ; Test va_arg(ap, int32_t)
-; CHECK-NEXT:    ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT:    ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
 ; CHECK-NEXT:    add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 3;
 ; CHECK-NEXT:    and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -4;
 ; CHECK-NEXT:    add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 4;
-; CHECK-NEXT:    st.u[[BITS]] [%SP], [[VA_PTR_NEXT]];
-; CHECK-NEXT:    ld.local.u32 %r{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
+; CHECK-NEXT:    st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT:    ld.local.b32 %r{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
 
   %0 = va_arg ptr %al, i32
 
 ; Test va_arg(ap, int64_t)
-; CHECK-NEXT:    ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT:    ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
 ; CHECK-NEXT:    add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7;
 ; CHECK-NEXT:    and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8;
 ; CHECK-NEXT:    add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8;
-; CHECK-NEXT:    st.u[[BITS]] [%SP], [[VA_PTR_NEXT]];
-; CHECK-NEXT:    ld.local.u64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
+; CHECK-NEXT:    st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT:    ld.local.b64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
 
   %1 = va_arg ptr %al, i64
 
 ; Test va_arg(ap, double)
-; CHECK-NEXT:    ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT:    ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
 ; CHECK-NEXT:    add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7;
 ; CHECK-NEXT:    and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8;
 ; CHECK-NEXT:    add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8;
-; CHECK-NEXT:    st.u[[BITS]] [%SP], [[VA_PTR_NEXT]];
-; CHECK-NEXT:    ld.local.f64 %fd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
+; CHECK-NEXT:    st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT:    ld.local.b64 %fd{{[0-9]+}}, [[[VA_PTR_ALIGN]]];
 
   %2 = va_arg ptr %al, double
 
 ; Test va_arg(ap, ptr)
-; CHECK-NEXT:    ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
+; CHECK-NEXT:    ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP];
 ; CHECK32-NEXT:  add.s32 [[VA_PTR_TMP:%r[0-9]+]], [[VA_PTR]], 3;
 ; CHECK64-NEXT:  add.s64 [[VA_PTR_TMP:%rd[0-9]+]], [[VA_PTR]], 7;
 ; CHECK32-NEXT:  and.b32 [[VA_PTR_ALIGN:%r[0-9]+]], [[VA_PTR_TMP]], -4;
 ; CHECK64-NEXT:  and.b64 [[VA_PTR_ALIGN:%rd[0-9]+]], [[VA_PTR_TMP]], -8;
 ; CHECK32-NEXT:  add.s32 [[VA_PTR_NEXT:%r[0-9]+]], [[VA_PTR_ALIGN]], 4;
 ; CHECK64-NEXT:  add.s64 [[VA_PTR_NEXT:%rd[0-9]+]], [[VA_PTR_ALIGN]], 8;
-; CHECK-NEXT:    st.u[[BITS]] [%SP], [[VA_PTR_NEXT]];
-; CHECK-NEXT:    ld.local.u[[BITS]] %{{(r|rd)[0-9]+}}, [[[VA_PTR_ALIGN]]];
+; CHECK-NEXT:    st.b[[BITS]] [%SP], [[VA_PTR_NEXT]];
+; CHECK-NEXT:    ld.local.b[[BITS]] %{{(r|rd)[0-9]+}}, [[[VA_PTR_ALIGN]]];
 
   %3 = va_arg ptr %al, ptr
   %call = call i32 @bar(i32 %a, i32 %0, i64 %1, double %2, ptr %3)
@@ -82,18 +82,18 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) {
 ; Test indirect variadic function call.
 
 ; Load arguments to temporary variables
-; CHECK32:       ld.param.u32 [[ARG_VOID_PTR:%r[0-9]+]], [test_foo_param_3];
-; CHECK64:       ld.param.u64 [[ARG_VOID_PTR:%rd[0-9]+]], [test_foo_param_3];
-; CHECK-NEXT:    ld.param.f64 [[ARG_DOUBLE:%fd[0-9]+]], [test_foo_param_2];
-; CHECK-NEXT:    ld.param.u64 [[ARG_I64:%rd[0-9]+]], [test_foo_param_1];
-; CHECK-NEXT:    ld.param.u32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0];
+; CHECK32:       ld.param.b32 [[ARG_VOID_PTR:%r[0-9]+]], [test_foo_param_3];
+; CHECK64:       ld.param.b64 [[ARG_VOID_PTR:%rd[0-9]+]], [test_foo_param_3];
+; CHECK-NEXT:    ld.param.b64 [[ARG_DOUBLE:%fd[0-9]+]], [test_foo_param_2];
+; CHECK-NEXT:    ld.param.b64 [[ARG_I64:%rd[0-9]+]], [test_foo_param_1];
+; CHECK-NEXT:    ld.param.b32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0];
 
 ; Store arguments to an array
 ; CHECK32:  .param .align 8 .b8 param1[28];
 ; CHECK64:  .param .align 8 .b8 param1[32];
 ; CHECK-NEXT:    st.param.b32 [param1], [[ARG_I32]];
 ; CHECK-NEXT:    st.param.b64 [param1+8], [[ARG_I64]];
-; CHECK-NEXT:    st.param.f64 [param1+16], [[ARG_DOUBLE]];
+; CHECK-NEXT:    st.param.b64 [param1+16], [[ARG_DOUBLE]];
 ; CHECK-NEXT:    st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]];
 ; CHECK-NEXT:    .param .b32 retval0;
 ; CHECK-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 9da361455a656..3235587f3d563 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -16,29 +16,29 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %fd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [variadics1_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics1_param_1];
-; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [variadics1_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [variadics1_param_1];
+; CHECK-PTX-NEXT:    ld.b32 %r2, [%rd1];
 ; CHECK-PTX-NEXT:    add.s32 %r3, %r1, %r2;
-; CHECK-PTX-NEXT:    ld.u32 %r4, [%rd1+4];
+; CHECK-PTX-NEXT:    ld.b32 %r4, [%rd1+4];
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r3, %r4;
-; CHECK-PTX-NEXT:    ld.u32 %r6, [%rd1+8];
+; CHECK-PTX-NEXT:    ld.b32 %r6, [%rd1+8];
 ; CHECK-PTX-NEXT:    add.s32 %r7, %r5, %r6;
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 19;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
+; CHECK-PTX-NEXT:    ld.b64 %rd4, [%rd3];
 ; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r7;
 ; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
 ; CHECK-PTX-NEXT:    cvt.u32.u64 %r8, %rd6;
 ; CHECK-PTX-NEXT:    add.s64 %rd7, %rd3, 15;
 ; CHECK-PTX-NEXT:    and.b64 %rd8, %rd7, -8;
-; CHECK-PTX-NEXT:    ld.f64 %fd1, [%rd8];
+; CHECK-PTX-NEXT:    ld.b64 %fd1, [%rd8];
 ; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r8;
 ; CHECK-PTX-NEXT:    add.rn.f64 %fd3, %fd2, %fd1;
 ; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r9, %fd3;
 ; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, 15;
 ; CHECK-PTX-NEXT:    and.b64 %rd10, %rd9, -8;
-; CHECK-PTX-NEXT:    ld.f64 %fd4, [%rd10];
+; CHECK-PTX-NEXT:    ld.b64 %fd4, [%rd10];
 ; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r9;
 ; CHECK-PTX-NEXT:    add.rn.f64 %fd6, %fd5, %fd4;
 ; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd6;
@@ -112,14 +112,14 @@ define dso_local i32 @foo() {
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot1;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    mov.b64 %rd1, 4294967297;
-; CHECK-PTX-NEXT:    st.u64 [%SP], %rd1;
+; CHECK-PTX-NEXT:    st.b64 [%SP], %rd1;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
-; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
+; CHECK-PTX-NEXT:    st.b32 [%SP+8], %r1;
 ; CHECK-PTX-NEXT:    mov.b64 %rd2, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd2;
+; CHECK-PTX-NEXT:    st.b64 [%SP+16], %rd2;
 ; CHECK-PTX-NEXT:    mov.b64 %rd3, 4607182418800017408;
-; CHECK-PTX-NEXT:    st.u64 [%SP+24], %rd3;
-; CHECK-PTX-NEXT:    st.u64 [%SP+32], %rd3;
+; CHECK-PTX-NEXT:    st.b64 [%SP+24], %rd3;
+; CHECK-PTX-NEXT:    st.b64 [%SP+32], %rd3;
 ; CHECK-PTX-NEXT:    add.u64 %rd4, %SP, 0;
 ; CHECK-PTX-NEXT:    { // callseq 0, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
@@ -157,20 +157,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot2;
-; CHECK-PTX-NEXT:    ld.param.u32 %r1, [variadics2_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics2_param_1];
+; CHECK-PTX-NEXT:    ld.param.b32 %r1, [variadics2_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [variadics2_param_1];
 ; CHECK-PTX-NEXT:    add.u64 %rd3, %SPL, 0;
 ; CHECK-PTX-NEXT:    add.s64 %rd4, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd5, %rd4, -8;
-; CHECK-PTX-NEXT:    ld.u32 %r2, [%rd5];
+; CHECK-PTX-NEXT:    ld.b32 %r2, [%rd5];
 ; CHECK-PTX-NEXT:    ld.s8 %r3, [%rd5+4];
-; CHECK-PTX-NEXT:    ld.u8 %rs1, [%rd5+7];
-; CHECK-PTX-NEXT:    st.local.u8 [%rd3+2], %rs1;
-; CHECK-PTX-NEXT:    ld.u8 %rs2, [%rd5+6];
-; CHECK-PTX-NEXT:    st.local.u8 [%rd3+1], %rs2;
-; CHECK-PTX-NEXT:    ld.u8 %rs3, [%rd5+5];
-; CHECK-PTX-NEXT:    st.local.u8 [%rd3], %rs3;
-; CHECK-PTX-NEXT:    ld.u64 %rd6, [%rd5+8];
+; CHECK-PTX-NEXT:    ld.b8 %rs1, [%rd5+7];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd3+2], %rs1;
+; CHECK-PTX-NEXT:    ld.b8 %rs2, [%rd5+6];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd3+1], %rs2;
+; CHECK-PTX-NEXT:    ld.b8 %rs3, [%rd5+5];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd3], %rs3;
+; CHECK-PTX-NEXT:    ld.b64 %rd6, [%rd5+8];
 ; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
 ; CHECK-PTX-NEXT:    cvt.u64.u32 %rd7, %r5;
@@ -220,21 +220,21 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot3;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs1, [__const_$_bar_$_s1+7];
+; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs2, %rs1;
-; CHECK-PTX-NEXT:    st.local.u8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs3, [__const_$_bar_$_s1+6];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs2;
+; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs4, %rs3;
-; CHECK-PTX-NEXT:    st.local.u8 [%rd2+1], %rs4;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs5, [__const_$_bar_$_s1+5];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs4;
+; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs6, %rs5;
-; CHECK-PTX-NEXT:    st.local.u8 [%rd2], %rs6;
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs6;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
-; CHECK-PTX-NEXT:    st.u32 [%SP+8], %r1;
+; CHECK-PTX-NEXT:    st.b32 [%SP+8], %r1;
 ; CHECK-PTX-NEXT:    mov.b16 %rs7, 1;
-; CHECK-PTX-NEXT:    st.u8 [%SP+12], %rs7;
+; CHECK-PTX-NEXT:    st.b8 [%SP+12], %rs7;
 ; CHECK-PTX-NEXT:    mov.b64 %rd3, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd3;
+; CHECK-PTX-NEXT:    st.b64 [%SP+16], %rd3;
 ; CHECK-PTX-NEXT:    add.u64 %rd4, %SP, 8;
 ; CHECK-PTX-NEXT:    { // callseq 1, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
@@ -269,10 +269,10 @@ define dso_local i32 @variadics3(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
-; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics3_param_1];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [variadics3_param_1];
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 15;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -16;
-; CHECK-PTX-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd3];
+; CHECK-PTX-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd3];
 ; CHECK-PTX-NEXT:    add.s32 %r5, %r1, %r2;
 ; CHECK-PTX-NEXT:    add.s32 %r6, %r5, %r3;
 ; CHECK-PTX-NEXT:    add.s32 %r7, %r6, %r4;
@@ -311,7 +311,7 @@ define dso_local i32 @baz() {
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot5;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
-; CHECK-PTX-NEXT:    st.v4.u32 [%SP], {%r1, %r1, %r1, %r1};
+; CHECK-PTX-NEXT:    st.v4.b32 [%SP], {%r1, %r1, %r1, %r1};
 ; CHECK-PTX-NEXT:    add.u64 %rd1, %SP, 0;
 ; CHECK-PTX-NEXT:    { // callseq 2, 0
 ; CHECK-PTX-NEXT:    .param .b32 param0;
@@ -341,12 +341,12 @@ define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, .
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<10>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
-; CHECK-PTX-NEXT:    ld.param.u64 %rd2, [variadics4_param_1];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [variadics4_param_1];
 ; CHECK-PTX-NEXT:    add.s64 %rd3, %rd2, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd4, %rd3, -8;
-; CHECK-PTX-NEXT:    ld.u64 %rd5, [%rd4];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd6, [variadics4_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd7, [variadics4_param_0+8];
+; CHECK-PTX-NEXT:    ld.b64 %rd5, [%rd4];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd6, [variadics4_param_0];
+; CHECK-PTX-NEXT:    ld.param.b64 %rd7, [variadics4_param_0+8];
 ; CHECK-PTX-NEXT:    add.s64 %rd8, %rd6, %rd7;
 ; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, %rd5;
 ; CHECK-PTX-NEXT:    cvt.u32.u64 %r1, %rd9;
@@ -385,14 +385,14 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT:    mov.b64 %SPL, __local_depot7;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd3, [__const_$_qux_$_s+8];
-; CHECK-PTX-NEXT:    st.local.u64 [%rd2+8], %rd3;
-; CHECK-PTX-NEXT:    ld.global.nc.u64 %rd4, [__const_$_qux_$_s];
-; CHECK-PTX-NEXT:    st.local.u64 [%rd2], %rd4;
+; CHECK-PTX-NEXT:    ld.global.nc.b64 %rd3, [__const_$_qux_$_s+8];
+; CHECK-PTX-NEXT:    st.local.b64 [%rd2+8], %rd3;
+; CHECK-PTX-NEXT:    ld.global.nc.b64 %rd4, [__const_$_qux_$_s];
+; CHECK-PTX-NEXT:    st.local.b64 [%rd2], %rd4;
 ; CHECK-PTX-NEXT:    mov.b64 %rd5, 1;
-; CHECK-PTX-NEXT:    st.u64 [%SP+16], %rd5;
-; CHECK-PTX-NEXT:    ld.local.u64 %rd6, [%rd2];
-; CHECK-PTX-NEXT:    ld.local.u64 %rd7, [%rd2+8];
+; CHECK-PTX-NEXT:    st.b64 [%SP+16], %rd5;
+; CHECK-PTX-NEXT:    ld.local.b64 %rd6, [%rd2];
+; CHECK-PTX-NEXT:    ld.local.b64 %rd7, [%rd2+8];
 ; CHECK-PTX-NEXT:    add.u64 %rd8, %SP, 16;
 ; CHECK-PTX-NEXT:    { // callseq 3, 0
 ; CHECK-PTX-NEXT:    .param .align 8 .b8 param0[16];
diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
index 5dea424c7dcc9..8710d58ce6e99 100644
--- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
@@ -5,40 +5,40 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 define <16 x float> @test_v16f32(<16 x float> %a) {
 ; CHECK-LABEL: test_v16f32(
-; CHECK-DAG: ld.param.v4.f32     {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
-; CHECK-DAG: ld.param.v4.f32     {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
-; CHECK-DAG: ld.param.v4.f32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
-; CHECK-DAG: ld.param.v4.f32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
-; CHECK-DAG: st.param.v4.f32     [func_retval0],  {[[V_0_3]]}
-; CHECK-DAG: st.param.v4.f32     [func_retval0+16], {[[V_4_7]]}
-; CHECK-DAG: st.param.v4.f32     [func_retval0+32], {[[V_8_11]]}
-; CHECK-DAG: st.param.v4.f32     [func_retval0+48], {[[V_12_15]]}
+; CHECK-DAG: ld.param.v4.b32     {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
+; CHECK-DAG: ld.param.v4.b32     {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
+; CHECK-DAG: ld.param.v4.b32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
+; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
+; CHECK-DAG: st.param.v4.b32     [func_retval0],  {[[V_0_3]]}
+; CHECK-DAG: st.param.v4.b32     [func_retval0+16], {[[V_4_7]]}
+; CHECK-DAG: st.param.v4.b32     [func_retval0+32], {[[V_8_11]]}
+; CHECK-DAG: st.param.v4.b32     [func_retval0+48], {[[V_12_15]]}
 ; CHECK: ret;
   ret <16 x float> %a
 }
 
 define <8 x float> @test_v8f32(<8 x float> %a) {
 ; CHECK-LABEL: test_v8f32(
-; CHECK-DAG: ld.param.v4.f32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
-; CHECK-DAG: ld.param.v4.f32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
-; CHECK-DAG: st.param.v4.f32     [func_retval0],  {[[V_0_3]]}
-; CHECK-DAG: st.param.v4.f32     [func_retval0+16], {[[V_4_7]]}
+; CHECK-DAG: ld.param.v4.b32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
+; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
+; CHECK-DAG: st.param.v4.b32     [func_retval0],  {[[V_0_3]]}
+; CHECK-DAG: st.param.v4.b32     [func_retval0+16], {[[V_4_7]]}
 ; CHECK: ret;
   ret <8 x float> %a
 }
 
 define <4 x float> @test_v4f32(<4 x float> %a) {
 ; CHECK-LABEL: test_v4f32(
-; CHECK-DAG: ld.param.v4.f32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
-; CHECK-DAG: st.param.v4.f32     [func_retval0],  {[[V_0_3]]}
+; CHECK-DAG: ld.param.v4.b32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
+; CHECK-DAG: st.param.v4.b32     [func_retval0],  {[[V_0_3]]}
 ; CHECK: ret;
   ret <4 x float> %a
 }
 
 define <2 x float> @test_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: test_v2f32(
-; CHECK-DAG: ld.param.v2.f32     {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
-; CHECK-DAG: st.param.v2.f32     [func_retval0],  {[[V_0_3]]}
+; CHECK-DAG: ld.param.v2.b32     {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
+; CHECK-DAG: st.param.v2.b32     [func_retval0],  {[[V_0_3]]}
 ; CHECK: ret;
   ret <2 x float> %a
 }
@@ -46,20 +46,20 @@ define <2 x float> @test_v2f32(<2 x float> %a) {
 ; Oddly shaped vectors should not load any extra elements.
 define <3 x float> @test_v3f32(<3 x float> %a) {
 ; CHECK-LABEL: test_v3f32(
-; CHECK-DAG: ld.param.f32        [[V_2:%f[0-9]+]], [test_v3f32_param_0+8];
-; CHECK-DAG: ld.param.v2.f32     {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
-; CHECK-DAG: st.param.v2.f32     [func_retval0], {[[V_0_1]]}
-; CHECK-DAG: st.param.f32        [func_retval0+8], [[V_2]]
+; CHECK-DAG: ld.param.b32        [[V_2:%f[0-9]+]], [test_v3f32_param_0+8];
+; CHECK-DAG: ld.param.v2.b32     {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
+; CHECK-DAG: st.param.v2.b32     [func_retval0], {[[V_0_1]]}
+; CHECK-DAG: st.param.b32        [func_retval0+8], [[V_2]]
 ; CHECK: ret;
   ret <3 x float> %a
 }
 
 define <8 x i64> @test_v8i64(<8 x i64> %a) {
 ; CHECK-LABEL: test_v8i64(
-; CHECK-DAG: ld.param.v2.u64     {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48];
-; CHECK-DAG: ld.param.v2.u64     {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32];
-; CHECK-DAG: ld.param.v2.u64     {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16];
-; CHECK-DAG: ld.param.v2.u64     {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0];
+; CHECK-DAG: ld.param.v2.b64     {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48];
+; CHECK-DAG: ld.param.v2.b64     {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32];
+; CHECK-DAG: ld.param.v2.b64     {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16];
+; CHECK-DAG: ld.param.v2.b64     {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0];
 ; CHECK-DAG: st.param.v2.b64     [func_retval0],  {[[V_0_1]]}
 ; CHECK-DAG: st.param.v2.b64     [func_retval0+16], {[[V_2_3]]}
 ; CHECK-DAG: st.param.v2.b64     [func_retval0+32], {[[V_4_5]]}
@@ -70,8 +70,8 @@ define <8 x i64> @test_v8i64(<8 x i64> %a) {
 
 define <16 x i16> @test_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: test_v16i16(
-; CHECK-DAG: ld.param.v4.u32     {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16];
-; CHECK-DAG: ld.param.v4.u32     {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0];
+; CHECK-DAG: ld.param.v4.b32     {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16];
+; CHECK-DAG: ld.param.v4.b32     {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0];
 ; CHECK-DAG: st.param.v4.b32     [func_retval0], {[[V_0_7]]}
 ; CHECK-DAG: st.param.v4.b32     [func_retval0+16], {[[V_8_15]]}
 ; CHECK: ret;
diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll
index 3a3dd8072abfd..b44c084bd7b83 100644
--- a/llvm/test/CodeGen/NVPTX/vec8.ll
+++ b/llvm/test/CodeGen/NVPTX/vec8.ll
@@ -5,10 +5,10 @@ target triple = "nvptx-unknown-cuda"
 
 ; CHECK: .visible .func foo
 define void @foo(<8 x i8> %a, ptr %b) {
-; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo_param_0]
-; CHECK-DAG: ld.param.u64   %[[B:rd[0-9+]]], [foo_param_1]
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo_param_0]
+; CHECK-DAG: ld.param.b64   %[[B:rd[0-9+]]], [foo_param_1]
 ; CHECK:     add.s16        [[T:%rs[0-9+]]],
-; CHECK:     st.u8          [%[[B]]], [[T]];
+; CHECK:     st.b8          [%[[B]]], [[T]];
   %t0 = extractelement <8 x i8> %a, i32 1
   %t1 = extractelement <8 x i8> %a, i32 6
   %t  = add i8 %t0, %t1
diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll
index bc1a138e25bd8..192cd562d67b9 100644
--- a/llvm/test/CodeGen/NVPTX/vector-args.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-args.ll
@@ -4,7 +4,7 @@
 define float @foo(<2 x float> %a) {
 ; CHECK: .func (.param .b32 func_retval0) foo
 ; CHECK: .param .align 8 .b8 foo_param_0[8]
-; CHECK: ld.param.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.param.v2.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = fmul <2 x float> %a, %a
   %t2 = extractelement <2 x float> %t1, i32 0
   %t3 = extractelement <2 x float> %t1, i32 1
@@ -16,7 +16,7 @@ define float @foo(<2 x float> %a) {
 define float @bar(<4 x float> %a) {
 ; CHECK: .func (.param .b32 func_retval0) bar
 ; CHECK: .param .align 16 .b8 bar_param_0[16]
-; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.param.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = fmul <4 x float> %a, %a
   %t2 = extractelement <4 x float> %t1, i32 0
   %t3 = extractelement <4 x float> %t1, i32 1
@@ -28,8 +28,8 @@ define float @bar(<4 x float> %a) {
 define <4 x float> @baz(<4 x float> %a) {
 ; CHECK: .func  (.param .align 16 .b8 func_retval0[16]) baz
 ; CHECK: .param .align 16 .b8 baz_param_0[16]
-; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-; CHECK: st.param.v4.f32 [func_retval0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.param.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: st.param.v4.b32 [func_retval0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = fmul <4 x float> %a, %a
   ret <4 x float> %t1
 }
diff --git a/llvm/test/CodeGen/NVPTX/vector-call.ll b/llvm/test/CodeGen/NVPTX/vector-call.ll
index 83439e7744fa8..27063f833b7db 100644
--- a/llvm/test/CodeGen/NVPTX/vector-call.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-call.ll
@@ -6,7 +6,7 @@ target triple = "nvptx-unknown-cuda"
 declare void @bar(<4 x i32>)
 
 ; CHECK-LABEL: .func foo(
-; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK-DAG: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
 ; CHECK: .param .align 16 .b8 param0[16];
 ; CHECK-DAG: st.param.v4.b32  [param0],  {[[E0]], [[E1]], [[E2]], [[E3]]};
 ; CHECK:     call.uni
@@ -17,8 +17,8 @@ define void @foo(<4 x i32> %a) {
 }
 
 ; CHECK-LABEL: .func foo3(
-; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
-; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [foo3_param_0+8];
 ; CHECK: .param .align 16 .b8 param0[16];
 ; CHECK-DAG: st.param.v2.b32  [param0],  {[[E0]], [[E1]]};
 ; CHECK-DAG: st.param.b32     [param0+8],  [[E2]];
diff --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll b/llvm/test/CodeGen/NVPTX/vector-compare.ll
index 4a1335f13b22a..0e63ee96932d9 100644
--- a/llvm/test/CodeGen/NVPTX/vector-compare.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll
@@ -9,9 +9,9 @@
 
 ; CHECK-LABEL: .visible .func foo(
 define void @foo(ptr %a, ptr %b, ptr %r1, ptr %r2) {
-; CHECK: ld.v2.u32
+; CHECK: ld.v2.b32
   %aval = load <2 x i32>, ptr %a
-; CHECK: ld.v2.u32
+; CHECK: ld.v2.b32
   %bval = load <2 x i32>, ptr %b
 ; CHECK: setp.lt.s32
 ; CHECK: setp.lt.s32
@@ -22,8 +22,8 @@ define void @foo(ptr %a, ptr %b, ptr %r1, ptr %r2) {
 ; CHECK: selp.b32        %r{{[0-9]+}}, 1, 0
   %t1a = zext i1 %t1 to i32
   %t2a = zext i1 %t2 to i32
-; CHECK: st.u32
-; CHECK: st.u32
+; CHECK: st.b32
+; CHECK: st.b32
   store i32 %t1a, ptr %r1
   store i32 %t2a, ptr %r2
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index d731985ae9710..825a66ec04b5e 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -10,7 +10,7 @@
 
 ; CHECK-LABEL: foo
 define void @foo(ptr %a) {
-; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.v2.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = load <2 x float>, ptr %a
   %t2 = fmul <2 x float> %t1, %t1
   store <2 x float> %t2, ptr %a
@@ -19,7 +19,7 @@ define void @foo(ptr %a) {
 
 ; CHECK-LABEL: foo2
 define void @foo2(ptr %a) {
-; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = load <4 x float>, ptr %a
   %t2 = fmul <4 x float> %t1, %t1
   store <4 x float> %t2, ptr %a
@@ -28,8 +28,8 @@ define void @foo2(ptr %a) {
 
 ; CHECK-LABEL: foo3
 define void @foo3(ptr %a) {
-; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
-; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK-NEXT: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = load <8 x float>, ptr %a
   %t2 = fmul <8 x float> %t1, %t1
   store <8 x float> %t2, ptr %a
@@ -40,7 +40,7 @@ define void @foo3(ptr %a) {
 
 ; CHECK-LABEL: foo4
 define void @foo4(ptr %a) {
-; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
+; CHECK: ld.v2.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <2 x i32>, ptr %a
   %t2 = mul <2 x i32> %t1, %t1
   store <2 x i32> %t2, ptr %a
@@ -49,7 +49,7 @@ define void @foo4(ptr %a) {
 
 ; CHECK-LABEL: foo5
 define void @foo5(ptr %a) {
-; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+; CHECK: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <4 x i32>, ptr %a
   %t2 = mul <4 x i32> %t1, %t1
   store <4 x i32> %t2, ptr %a
@@ -58,8 +58,8 @@ define void @foo5(ptr %a) {
 
 ; CHECK-LABEL: foo6
 define void @foo6(ptr %a) {
-; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
-; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+; CHECK: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+; CHECK-NEXT: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <8 x i32>, ptr %a
   %t2 = mul <8 x i32> %t1, %t1
   store <8 x i32> %t2, ptr %a
@@ -86,7 +86,7 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
   %t11 = zext i32 %t10 to i64
   %t20 = zext i32 %t2 to i64
   %t27 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t9
-; CHECK: ld.v2.u8
+; CHECK: ld.v2.b8
   %t28 = load i8, ptr %t27, align 2
   %t31 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t11
   %t32 = load i8, ptr %t31, align 1
@@ -114,8 +114,8 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
-; CHECK: st.global.v4.f32
-; CHECK: st.global.v4.f32
+; CHECK: st.global.v4.b32
+; CHECK: st.global.v4.b32
   store <8 x float> %ext, ptr addrspace(1) %dst, align 16
   ret void
 }
@@ -140,8 +140,8 @@ define void @extv8f16_global_a4(ptr addrspace(1) noalias readonly align 16 %dst,
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
-; CHECK: st.global.v4.f32
-; CHECK: st.global.v4.f32
+; CHECK: st.global.v4.b32
+; CHECK: st.global.v4.b32
   store <8 x float> %ext, ptr addrspace(1) %dst, align 16
   ret void
 }
@@ -164,8 +164,8 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
-; CHECK: st.v4.f32
-; CHECK: st.v4.f32
+; CHECK: st.v4.b32
+; CHECK: st.v4.b32
   store <8 x float> %ext, ptr %dst, align 16
   ret void
 }
@@ -190,8 +190,8 @@ define void @extv8f16_generic_a4(ptr noalias readonly align 16 %dst, ptr noalias
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
 ; CHECK: cvt.f32.f16 %f{{.*}}, %rs
   %ext = fpext <8 x half> %v to <8 x float>
-; CHECK: st.v4.f32
-; CHECK: st.v4.f32
+; CHECK: st.v4.b32
+; CHECK: st.v4.b32
   store <8 x float> %ext, ptr %dst, align 16
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll
index 90d7e24c7ea78..569da5e6628b0 100644
--- a/llvm/test/CodeGen/NVPTX/vector-select.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-select.ll
@@ -9,9 +9,9 @@
 ; CHECK-LABEL: .visible .func foo(
 define void @foo(ptr addrspace(1) %def_a, ptr addrspace(1) %def_b, ptr addrspace(1) %def_c) {
 entry:
-; CHECK:  ld.global.v2.u32
-; CHECK:  ld.global.v2.u32
-; CHECK:  ld.global.v2.u32
+; CHECK:  ld.global.v2.b32
+; CHECK:  ld.global.v2.b32
+; CHECK:  ld.global.v2.b32
   %tmp4 = load <2 x i32>, ptr addrspace(1) %def_a
   %tmp6 = load <2 x i32>, ptr addrspace(1) %def_c
   %tmp8 = load <2 x i32>, ptr addrspace(1) %def_b
@@ -21,7 +21,7 @@ entry:
 ; CHECK:  selp.b32
 ; CHECK:  selp.b32
   %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
-; CHECK:  st.global.v2.u32
+; CHECK:  st.global.v2.b32
   store <2 x i32> %cond, ptr addrspace(1) %def_c
   ret void
 }
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index cbcaf5fc3822e..f3b1015070085 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -2,28 +2,28 @@
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 ; CHECK-LABEL: .visible .func foo1
-; CHECK: st.v2.f32
+; CHECK: st.v2.b32
 define void @foo1(<2 x float> %val, ptr %ptr) {
   store <2 x float> %val, ptr %ptr
   ret void
 }
 
 ; CHECK-LABEL: .visible .func foo2
-; CHECK: st.v4.f32
+; CHECK: st.v4.b32
 define void @foo2(<4 x float> %val, ptr %ptr) {
   store <4 x float> %val, ptr %ptr
   ret void
 }
 
 ; CHECK-LABEL: .visible .func foo3
-; CHECK: st.v2.u32
+; CHECK: st.v2.b32
 define void @foo3(<2 x i32> %val, ptr %ptr) {
   store <2 x i32> %val, ptr %ptr
   ret void
 }
 
 ; CHECK-LABEL: .visible .func foo4
-; CHECK: st.v4.u32
+; CHECK: st.v4.b32
 define void @foo4(<4 x i32> %val, ptr %ptr) {
   store <4 x i32> %val, ptr %ptr
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
index ec9c38258c57a..b77f69fd717f7 100644
--- a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
+++ b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll
@@ -4,10 +4,10 @@
 target triple = "nvptx64-nvidia-cuda"
 
 ; CHECK-LABEL: test1
-; CHECK: ld.global.v2.f32
-; CHECK: ld.global.v2.f32
-; CHECK: st.global.v2.f32
-; CHECK: st.global.v2.f32
+; CHECK: ld.global.v2.b32
+; CHECK: ld.global.v2.b32
+; CHECK: st.global.v2.b32
+; CHECK: st.global.v2.b32
 define void @test1(ptr addrspace(1) noalias align 8 %in, ptr addrspace(1) noalias align 8 %out) {
   %in.1 = getelementptr float, ptr addrspace(1) %in, i32 1
   %in.2 = getelementptr float, ptr addrspace(1) %in, i32 2

>From c9d08db69b2173488670ce40fdee4a90c0d074c9 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 30 Apr 2025 20:01:18 +0000
Subject: [PATCH 2/2] more test updates

---
 clang/test/CodeGenCUDA/bf16.cu                |  2 +-
 clang/test/CodeGenCUDA/fp-contract.cu         | 28 +++++++--------
 clang/test/CodeGenCUDA/memcpy-libcall.cu      | 28 +++++++--------
 llvm/test/DebugInfo/NVPTX/debug-info.ll       | 14 ++++----
 .../NaryReassociate/NVPTX/nary-slsr.ll        |  2 +-
 .../NVPTX/split-gep-and-gvn.ll                | 36 +++++++++----------
 .../NVPTX/reassociate-geps-and-slsr.ll        | 12 +++----
 .../NVPTX/speculative-slsr.ll                 |  4 +--
 .../Inputs/nvptx-basic.ll.expected            | 22 ++++++------
 9 files changed, 74 insertions(+), 74 deletions(-)

diff --git a/clang/test/CodeGenCUDA/bf16.cu b/clang/test/CodeGenCUDA/bf16.cu
index f794b83239f14..df56ec60c63ae 100644
--- a/clang/test/CodeGenCUDA/bf16.cu
+++ b/clang/test/CodeGenCUDA/bf16.cu
@@ -11,7 +11,7 @@
 // CHECK:        .param .align 2 .b8 _Z8test_argPDF16bDF16b_param_1[2]
 //
 __device__ void test_arg(__bf16 *out, __bf16 in) {
-// CHECK-DAG:     ld.param.u64  %[[A:rd[0-9]+]], [_Z8test_argPDF16bDF16b_param_0];
+// CHECK-DAG:     ld.param.b64  %[[A:rd[0-9]+]], [_Z8test_argPDF16bDF16b_param_0];
 // CHECK-DAG:     ld.param.b16  %[[R:rs[0-9]+]], [_Z8test_argPDF16bDF16b_param_1];
   __bf16 bf16 = in;
   *out = bf16;
diff --git a/clang/test/CodeGenCUDA/fp-contract.cu b/clang/test/CodeGenCUDA/fp-contract.cu
index 60824ba59ddfb..d6c796a817cbf 100644
--- a/clang/test/CodeGenCUDA/fp-contract.cu
+++ b/clang/test/CodeGenCUDA/fp-contract.cu
@@ -179,26 +179,26 @@
 __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
 // COMMON-LABEL: _Z4funcfff
 // NV-ON:       fma.rn.f32
-// NV-ON-NEXT:  st.param.f32
+// NV-ON-NEXT:  st.param.b32
 // AMD-ON:       v_fmac_f32_e64
 // AMD-ON-NEXT:  s_setpc_b64
 
 // NV-OFF:      mul.rn.f32
 // NV-OFF-NEXT: add.rn.f32
-// NV-OFF-NEXT: st.param.f32
+// NV-OFF-NEXT: st.param.b32
 // AMD-OFF:      v_mul_f32_e64
 // AMD-OFF-NEXT: v_add_f32_e64
 // AMD-OFF-NEXT: s_setpc_b64
 
 // NV-OPT-FAST: fma.rn.f32
-// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FAST-NEXT: st.param.b32
 // NV-OPT-FASTSTD: fma.rn.f32
-// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-FASTSTD-NEXT: st.param.b32
 // NV-OPT-ON: fma.rn.f32
-// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-ON-NEXT: st.param.b32
 // NV-OPT-OFF: mul.rn.f32
 // NV-OPT-OFF-NEXT: add.rn.f32
-// NV-OPT-OFF-NEXT: st.param.f32
+// NV-OPT-OFF-NEXT: st.param.b32
 
 // AMD-OPT-FAST-IR: fmul contract float
 // AMD-OPT-FAST-IR: fadd contract float
@@ -224,15 +224,15 @@ __host__ __device__ float func2(float a, float b, float c) {
 }
 // COMMON-LABEL: _Z5func2fff
 // NV-OPT-FAST: fma.rn.f32
-// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FAST-NEXT: st.param.b32
 // NV-OPT-FASTSTD: fma.rn.f32
-// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-FASTSTD-NEXT: st.param.b32
 // NV-OPT-ON: mul.rn.f32
 // NV-OPT-ON: add.rn.f32
-// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-ON-NEXT: st.param.b32
 // NV-OPT-OFF: mul.rn.f32
 // NV-OPT-OFF: add.rn.f32
-// NV-OPT-OFF-NEXT: st.param.f32
+// NV-OPT-OFF-NEXT: st.param.b32
 
 // AMD-OPT-FAST-IR: fmul contract float
 // AMD-OPT-FAST-IR: fadd contract float
@@ -267,16 +267,16 @@ __host__ __device__ float func2(float a, float b, float c) {
 }
 // COMMON-LABEL: _Z5func3fff
 // NV-OPT-FAST: fma.rn.f32
-// NV-OPT-FAST-NEXT: st.param.f32
+// NV-OPT-FAST-NEXT: st.param.b32
 // NV-OPT-FASTSTD: mul.rn.f32
 // NV-OPT-FASTSTD: add.rn.f32
-// NV-OPT-FASTSTD-NEXT: st.param.f32
+// NV-OPT-FASTSTD-NEXT: st.param.b32
 // NV-OPT-ON: mul.rn.f32
 // NV-OPT-ON: add.rn.f32
-// NV-OPT-ON-NEXT: st.param.f32
+// NV-OPT-ON-NEXT: st.param.b32
 // NV-OPT-OFF: mul.rn.f32
 // NV-OPT-OFF: add.rn.f32
-// NV-OPT-OFF-NEXT: st.param.f32
+// NV-OPT-OFF-NEXT: st.param.b32
 
 // AMD-OPT-FAST-IR: fmul float
 // AMD-OPT-FAST-IR: fadd float
diff --git a/clang/test/CodeGenCUDA/memcpy-libcall.cu b/clang/test/CodeGenCUDA/memcpy-libcall.cu
index 1180767545b12..c20fa2faceb01 100644
--- a/clang/test/CodeGenCUDA/memcpy-libcall.cu
+++ b/clang/test/CodeGenCUDA/memcpy-libcall.cu
@@ -10,15 +10,15 @@
 // PTX-LABEL: .func _Z12copy_genericPvPKv(
 void __device__ copy_generic(void *dest, const void *src) {
   __builtin_memcpy(dest, src, 32);
-// PTX:        ld.u8
-// PTX:        st.u8
+// PTX:        ld.b8
+// PTX:        st.b8
 }
 
 // PTX-LABEL: .entry _Z11copy_globalPvS_(
 void __global__ copy_global(void *dest, void * src) {
   __builtin_memcpy(dest, src, 32);
-// PTX:        ld.global.u8
-// PTX:        st.global.u8
+// PTX:        ld.global.b8
+// PTX:        st.global.b8
 }
 
 struct S {
@@ -28,24 +28,24 @@ struct S {
 // PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_(
 void __global__ copy_param_to_global(S *global, S param) {
   __builtin_memcpy(global, &param, sizeof(S));
-// PTX:        ld.param.u32
-// PTX:        st.global.u32
+// PTX:        ld.param.b32
+// PTX:        st.global.b32
 }
 
 // PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_(
 void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local,
                                     S param) {
   __builtin_memcpy(local, &param, sizeof(S));
-// PTX:        ld.param.u32
-// PTX:        st.local.u32
+// PTX:        ld.param.b32
+// PTX:        st.local.b32
 }
 
 // PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_(
 void __device__ copy_local_to_generic(S *generic,
                                      __attribute__((address_space(5))) S *src) {
   __builtin_memcpy(generic, src, sizeof(S));
-// PTX:        ld.local.u32
-// PTX:        st.u32
+// PTX:        ld.local.b32
+// PTX:        st.b32
 }
 
 __shared__ S shared;
@@ -53,12 +53,12 @@ __shared__ S shared;
 // PTX-LABEL: .entry _Z20copy_param_to_shared1S(
 void __global__ copy_param_to_shared( S param) {
   __builtin_memcpy(&shared, &param, sizeof(S));
-// PTX:        ld.param.u32
-// PTX:        st.shared.u32
+// PTX:        ld.param.b32
+// PTX:        st.shared.b32
 }
 
 void __device__ copy_shared_to_generic(S *generic) {
   __builtin_memcpy(generic, &shared, sizeof(S));
-// PTX:        ld.shared.u32
-// PTX:        st.u32
+// PTX:        ld.shared.b32
+// PTX:        st.b32
 }
diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll
index 1fc945b364c93..2d8a17f7178a3 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-info.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll
@@ -24,10 +24,10 @@
 ; CHECK-DAG: .reg .b32       %r<6>;
 ; CHECK-DAG: .reg .b64       %rd<8>;
 ; CHECK: .loc [[DEBUG_INFO_CU:[0-9]+]] 5 0
-; CHECK: ld.param.u32    %r{{.+}}, [{{.+}}];
-; CHECK: ld.param.u64    %rd{{.+}}, [{{.+}}];
+; CHECK: ld.param.b32    %r{{.+}}, [{{.+}}];
+; CHECK: ld.param.b64    %rd{{.+}}, [{{.+}}];
 ; CHECK: cvta.to.global.u64      %rd{{.+}}, %rd{{.+}};
-; CHECK: ld.param.u64    %rd{{.+}}, [{{.+}}];
+; CHECK: ld.param.b64    %rd{{.+}}, [{{.+}}];
 ; CHECK: cvta.to.global.u64      %rd{{.+}}, %rd{{.+}};
 ; CHECK: .loc [[BUILTUIN_VARS_H:[0-9]+]] 78 180
 ; CHECK: mov.u32         %r{{.+}}, %ctaid.x;
@@ -41,18 +41,18 @@
 ; CHECK: setp.ge.s32     %p{{.+}}, %r{{.+}}, %r{{.+}};
 ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7
 ; CHECK: @%p{{.+}} bra   [[BB:\$L__.+]];
-; CHECK: ld.param.f32    %f{{.+}}, [{{.+}}];
+; CHECK: ld.param.b32    %f{{.+}}, [{{.+}}];
 ; CHECK: .loc [[DEBUG_INFO_CU]] 8 13
 ; CHECK: mul.wide.u32    %rd{{.+}}, %r{{.+}}, 4;
 ; CHECK: add.s64         %rd{{.+}}, %rd{{.+}}, %rd{{.+}};
-; CHECK: ld.global.f32   %f{{.+}}, [%rd{{.+}}];
+; CHECK: ld.global.b32   %f{{.+}}, [%rd{{.+}}];
 ; CHECK: .loc [[DEBUG_INFO_CU]] 8 19
 ; CHECK: add.s64         %rd{{.+}}, %rd{{.+}}, %rd{{.+}};
-; CHECK: ld.global.f32   %f{{.+}}, [%rd{{.+}}];
+; CHECK: ld.global.b32   %f{{.+}}, [%rd{{.+}}];
 ; CHECK: .loc [[DEBUG_INFO_CU]] 3 82
 ; CHECK: fma.rn.f32      %f{{.+}}, %f{{.+}}, %f{{.+}}, %f{{.+}};
 ; CHECK: .loc [[DEBUG_INFO_CU]] 3 78
-; CHECK: st.global.f32   [%rd{{.+}}], %f{{.+}};
+; CHECK: st.global.b32   [%rd{{.+}}], %f{{.+}};
 ; CHECK: [[BB]]:
 ; CHECK: .loc [[DEBUG_INFO_CU]] 9 1
 ; CHECK: ret;
diff --git a/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll b/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll
index 8ec573c2e9ea9..bf5f33b03fce5 100644
--- a/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll
+++ b/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll
@@ -17,7 +17,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 define void @nary_reassociate_after_slsr(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: @nary_reassociate_after_slsr(
 ; PTX-LABEL: .visible .func nary_reassociate_after_slsr(
-; PTX: ld.param.u32 [[b:%r[0-9]+]], [nary_reassociate_after_slsr_param_1];
+; PTX: ld.param.b32 [[b:%r[0-9]+]], [nary_reassociate_after_slsr_param_1];
   %ab = add i32 %a, %b
   %abc = add i32 %ab, %c
   call void @foo(i32 %abc)
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
index 4474585bf9b06..11f9d7018a027 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
@@ -70,10 +70,10 @@ define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) {
   ret void
 }
 ; PTX-LABEL: sum_of_array(
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
 
 ; TODO: GVN is unable to preserve the "inbounds" keyword on the first GEP. Need
 ; some infrastructure changes to enable such optimizations.
@@ -134,10 +134,10 @@ define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) {
   ret void
 }
 ; PTX-LABEL: sum_of_array2(
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
 
 
 
@@ -203,10 +203,10 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) {
   ret void
 }
 ; PTX-LABEL: sum_of_array3(
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
 
 
 
@@ -268,10 +268,10 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) {
   ret void
 }
 ; PTX-LABEL: sum_of_array4(
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
-; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128]
+; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132]
 
 
 
@@ -307,7 +307,7 @@ entry:
   %0 = sext i32 %xy to i64
   %p0 = getelementptr inbounds float, ptr %input, i64 %0
   %v0 = load float, ptr %p0, align 4
-; PTX: ld.f32 %f{{[0-9]+}}, [[[p0:%rd[0-9]+]]]
+; PTX: ld.b32 %f{{[0-9]+}}, [[[p0:%rd[0-9]+]]]
   call void @use(float %v0)
 
   %y5 = add nsw i32 %y, 5
@@ -315,7 +315,7 @@ entry:
   %1 = sext i32 %xy5 to i64
   %p1 = getelementptr inbounds float, ptr %input, i64 %1
   %v1 = load float, ptr %p1, align 4
-; PTX: ld.f32 %f{{[0-9]+}}, [[[p0]]+20]
+; PTX: ld.b32 %f{{[0-9]+}}, [[[p0]]+20]
   call void @use(float %v1)
 
   ret void
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
index edaeef8c87b6c..11f2be7b763ca 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
@@ -51,8 +51,8 @@ define void @slsr_after_reassociate_geps(ptr %arr, i32 %i) {
 ; CHECK-NEXT:    ret void
 ;
 ; PTX-LABEL: .visible .func slsr_after_reassociate_geps(
-; PTX: ld.param.u64 [[arr:%rd[0-9]+]], [slsr_after_reassociate_geps_param_0];
-; PTX: ld.param.u32 [[i:%r[0-9]+]], [slsr_after_reassociate_geps_param_1];
+; PTX: ld.param.b64 [[arr:%rd[0-9]+]], [slsr_after_reassociate_geps_param_0];
+; PTX: ld.param.b32 [[i:%r[0-9]+]], [slsr_after_reassociate_geps_param_1];
   %i2 = shl nsw i32 %i, 1
   %i3 = mul nsw i32 %i, 3
   %i4 = shl nsw i32 %i, 2
@@ -62,28 +62,28 @@ define void @slsr_after_reassociate_geps(ptr %arr, i32 %i) {
 ; PTX: mul.wide.s32 [[i4:%rd[0-9]+]], [[i]], 4;
 ; PTX: add.s64 [[base1:%rd[0-9]+]], [[arr]], [[i4]];
   %v1 = load float, ptr %p1, align 4
-; PTX: ld.f32 {{%f[0-9]+}}, [[[base1]]+20];
+; PTX: ld.b32 {{%f[0-9]+}}, [[[base1]]+20];
   call void @foo(float %v1)
 
   %j2 = add nsw i32 %i2, 5
   %p2 = getelementptr inbounds float, ptr %arr, i32 %j2
 ; PTX: add.s64 [[base2:%rd[0-9]+]], [[base1]], [[i4]];
   %v2 = load float, ptr %p2, align 4
-; PTX: ld.f32 {{%f[0-9]+}}, [[[base2]]+20];
+; PTX: ld.b32 {{%f[0-9]+}}, [[[base2]]+20];
   call void @foo(float %v2)
 
   %j3 = add nsw i32 %i3, 5
   %p3 = getelementptr inbounds float, ptr %arr, i32 %j3
 ; PTX: add.s64 [[base3:%rd[0-9]+]], [[base2]], [[i4]];
   %v3 = load float, ptr %p3, align 4
-; PTX: ld.f32 {{%f[0-9]+}}, [[[base3]]+20];
+; PTX: ld.b32 {{%f[0-9]+}}, [[[base3]]+20];
   call void @foo(float %v3)
 
   %j4 = add nsw i32 %i4, 5
   %p4 = getelementptr inbounds float, ptr %arr, i32 %j4
 ; PTX: add.s64 [[base4:%rd[0-9]+]], [[base3]], [[i4]];
   %v4 = load float, ptr %p4, align 4
-; PTX: ld.f32 {{%f[0-9]+}}, [[[base4]]+20];
+; PTX: ld.b32 {{%f[0-9]+}}, [[[base4]]+20];
   call void @foo(float %v4)
 
   ret void
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
index 420e844b51039..6d086c3ea55b9 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll
@@ -14,8 +14,8 @@ target triple = "nvptx64-nvidia-cuda"
 define ptx_kernel void @foo(i32 %b, i32 %s) {
 ; CHECK-LABEL: .visible .entry foo(
 entry:
-; CHECK: ld.param.u32 [[s:%r[0-9]+]], [foo_param_1];
-; CHECK: ld.param.u32 [[b:%r[0-9]+]], [foo_param_0];
+; CHECK: ld.param.b32 [[s:%r[0-9]+]], [foo_param_1];
+; CHECK: ld.param.b32 [[b:%r[0-9]+]], [foo_param_0];
   %call = tail call zeroext i1 @cond(i32 0)
   br i1 %call, label %if.then, label %for.inc
 
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
index ad0b11ed6a806..b8779b9d54ea7 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
@@ -10,10 +10,10 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
 ; CHECK-NEXT:    .reg .b64 %rd<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [caller_St8x4_param_0+8];
-; CHECK-NEXT:    ld.param.u64 %rd2, [caller_St8x4_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd3, [caller_St8x4_param_0+24];
-; CHECK-NEXT:    ld.param.u64 %rd4, [caller_St8x4_param_0+16];
+; CHECK-NEXT:    ld.param.b64 %rd1, [caller_St8x4_param_0+8];
+; CHECK-NEXT:    ld.param.b64 %rd2, [caller_St8x4_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd3, [caller_St8x4_param_0+24];
+; CHECK-NEXT:    ld.param.b64 %rd4, [caller_St8x4_param_0+16];
 ; CHECK-NEXT:    { // callseq 0, 0
 ; CHECK-NEXT:    .param .align 16 .b8 param0[32];
 ; CHECK-NEXT:    st.param.v2.b64 [param0], {%rd2, %rd1};
@@ -27,11 +27,11 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [retval0];
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [retval0+16];
 ; CHECK-NEXT:    } // callseq 0
-; CHECK-NEXT:    ld.param.u32 %r2, [caller_St8x4_param_1];
-; CHECK-NEXT:    st.u64 [%r2], %rd5;
-; CHECK-NEXT:    st.u64 [%r2+8], %rd6;
-; CHECK-NEXT:    st.u64 [%r2+16], %rd7;
-; CHECK-NEXT:    st.u64 [%r2+24], %rd8;
+; CHECK-NEXT:    ld.param.b32 %r2, [caller_St8x4_param_1];
+; CHECK-NEXT:    st.b64 [%r2], %rd5;
+; CHECK-NEXT:    st.b64 [%r2+8], %rd6;
+; CHECK-NEXT:    st.b64 [%r2+16], %rd7;
+; CHECK-NEXT:    st.b64 [%r2+24], %rd8;
 ; CHECK-NEXT:    ret;
   %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2
   %.fca.0.extract = extractvalue [4 x i64] %call, 0
@@ -56,8 +56,8 @@ define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly by
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [callee_St8x4_param_0];
-; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [callee_St8x4_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [callee_St8x4_param_0];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [callee_St8x4_param_0+16];
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd1, %rd2};
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
 ; CHECK-NEXT:    ret;



More information about the cfe-commits mailing list