[llvm] [NVPTX] Add intrinsic support for specialized prmt variants (PR #140951)

Wed May 21 12:02:21 PDT 2025

https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/140951

>From 43c4c905c796844cc75f534a920f237f18926dbd Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 14 May 2025 22:01:16 +0000
Subject: [PATCH] [NVPTX] Add support for specialized prmt variants

---
 llvm/docs/NVPTXUsage.rst                 | 120 +++++++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  20 +++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  |  56 ++++++-----
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td |  19 +++-
 llvm/test/CodeGen/NVPTX/bswap.ll         |  46 +++++----
 llvm/test/CodeGen/NVPTX/prmt.ll          | 113 +++++++++++++++++++++
 6 files changed, 326 insertions(+), 48 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/prmt.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 07750579f5a5e..60cc055089a01 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -661,6 +661,126 @@ all bits set to 0 except for %b bits starting at bit position %a. For the
 '``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
 which in practice is equivalent to using them as is.
 
+'``llvm.nvvm.prmt``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.prmt(i32 %a, i32 %b, i32 %c)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.prmt``' intrinsic constructs a permutation of the bytes of the
+first two operands, selecting based on the third operand.
+
+Semantics:
+""""""""""
+
+The bytes in the first two source operands are numbered from 0 to 7:
+{%b, %a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each byte in the target
+register, a 4-bit selection value is defined.
+
+The 3 lsbs of the selection value specify which of the 8 source bytes should be
+moved into the target position. The msb defines if the byte value should be
+copied, or if the sign (msb of the byte) should be replicated over all 8 bits
+of the target position (sign extend of the byte value); msb=0 means copy the
+literal value; msb=1 means replicate the sign.
+
+These 4-bit selection values are pulled from the lower 16 bits of the third
+operand, with the least significant selection value corresponding to the least
+significant byte of the destination.
+
+
+'``llvm.nvvm.prmt.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.prmt.f4e(i32 %a, i32 %b, i32 %c)
+    declare i32 @llvm.nvvm.prmt.b4e(i32 %a, i32 %b, i32 %c)
+
+    declare i32 @llvm.nvvm.prmt.rc8(i32 %a, i32 %c)
+    declare i32 @llvm.nvvm.prmt.ecl(i32 %a, i32 %c)
+    declare i32 @llvm.nvvm.prmt.ecr(i32 %a, i32 %c)
+    declare i32 @llvm.nvvm.prmt.rc16(i32 %a, i32 %c)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.prmt.*``' family of intrinsics constructs a permutation of the
+bytes of the first one or two operands, selecting based on the 2 least
+significant bits of the final operand.
+
+Semantics:
+""""""""""
+
+As with the generic '``llvm.nvvm.prmt``' intrinsic, the bytes in the first one
+or two source operands are numbered. The first source operand (%a) is numbered
+{b3, b2, b1, b0}; in the case of the '``f4e``' and '``b4e``' variants, the
+second source operand (%b) is numbered {b7, b6, b5, b4}.
+
+Depending on the 2 least significant bits of the final operand, the result of
+the permutation is defined as follows:
+
++------------+---------+--------------+
+|    Mode    | %c[1:0] |    Output    |
++------------+---------+--------------+
+| '``f4e``'  |   0     | {3, 2, 1, 0} |
+|            +---------+--------------+
+|            |   1     | {4, 3, 2, 1} |
+|            +---------+--------------+
+|            |   2     | {5, 4, 3, 2} |
+|            +---------+--------------+
+|            |   3     | {6, 5, 4, 3} |
++------------+---------+--------------+
+| '``b4e``'  |   0     | {5, 6, 7, 0} |
+|            +---------+--------------+
+|            |   1     | {6, 7, 0, 1} |
+|            +---------+--------------+
+|            |   2     | {7, 0, 1, 2} |
+|            +---------+--------------+
+|            |   3     | {0, 1, 2, 3} |
++------------+---------+--------------+
+| '``rc8``'  |   0     | {0, 0, 0, 0} |
+|            +---------+--------------+
+|            |   1     | {1, 1, 1, 1} |
+|            +---------+--------------+
+|            |   2     | {2, 2, 2, 2} |
+|            +---------+--------------+
+|            |   3     | {3, 3, 3, 3} |
++------------+---------+--------------+
+| '``ecl``'  |   0     | {3, 2, 1, 0} |
+|            +---------+--------------+
+|            |   1     | {3, 2, 1, 1} |
+|            +---------+--------------+
+|            |   2     | {3, 2, 2, 2} |
+|            +---------+--------------+
+|            |   3     | {3, 3, 3, 3} |
++------------+---------+--------------+
+| '``ecr``'  |   0     | {0, 0, 0, 0} |
+|            +---------+--------------+
+|            |   1     | {1, 1, 1, 0} |
+|            +---------+--------------+
+|            |   2     | {2, 2, 1, 0} |
+|            +---------+--------------+
+|            |   3     | {3, 2, 1, 0} |
++------------+---------+--------------+
+| '``rc16``' |   0     | {1, 0, 1, 0} |
+|            +---------+--------------+
+|            |   1     | {3, 2, 3, 2} |
+|            +---------+--------------+
+|            |   2     | {1, 0, 1, 0} |
+|            +---------+--------------+
+|            |   3     | {3, 2, 3, 2} |
++------------+---------+--------------+
+
 TMA family of Intrinsics
 ------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index aaa24fa43252f..95096672baf74 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -745,9 +745,23 @@ class NVVMBuiltin :
 }
 
 let TargetPrefix = "nvvm" in {
-  def int_nvvm_prmt : NVVMBuiltin,
-      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-        [IntrNoMem, IntrSpeculatable]>;
+
+  // PRMT - permute
+
+  let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
+    def int_nvvm_prmt : NVVMBuiltin,
+      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+
+    foreach mode = ["f4e", "b4e"] in
+      def int_nvvm_prmt_ # mode :
+          DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+
+    // Note: in PTX these variants also take 2 source operands, but only the
+    // first is ever read, so the unused operand is omitted from the IR.
+    foreach mode = ["rc8", "ecl", "ecr", "rc16"] in
+      def int_nvvm_prmt_ # mode :
+          DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
+  }
 
   def int_nvvm_nanosleep : NVVMBuiltin,
       DefaultAttrsIntrinsic<[], [llvm_i32_ty],
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5234fb0806189..8ff5f58f3a699 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1622,24 +1622,6 @@ def Hexu32imm : Operand<i32> {
   let PrintMethod = "printHexu32imm";
 }
 
-multiclass PRMT<ValueType T, RegisterClass RC> {
-  def rrr
-    : NVPTXInst<(outs RC:$d),
-                (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
-                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
-                [(set T:$d, (prmt T:$a, T:$b, i32:$c, imm:$mode))]>;
-  def rri
-    : NVPTXInst<(outs RC:$d),
-                (ins RC:$a, Int32Regs:$b, Hexu32imm:$c, PrmtMode:$mode),
-                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
-                [(set T:$d, (prmt T:$a, T:$b, imm:$c, imm:$mode))]>;
-  def rii
-    : NVPTXInst<(outs RC:$d),
-                (ins RC:$a, i32imm:$b, Hexu32imm:$c, PrmtMode:$mode),
-                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
-                [(set T:$d, (prmt T:$a, imm:$b, imm:$c, imm:$mode))]>;
-}
-
 let hasSideEffects = false in {
   // order is somewhat important here. signed/unsigned variants match
   // the same patterns, so the first one wins. Having unsigned byte extraction
@@ -1653,7 +1635,31 @@ let hasSideEffects = false in {
   defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
   defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
 
-  defm PRMT_B32 : PRMT<i32, Int32Regs>;
+  def PRMT_B32rrr
+    : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+                (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+                (ins PrmtMode:$mode),
+                "prmt.b32$mode",
+                [(set i32:$d, (prmt i32:$a, i32:$b, i32:$c, imm:$mode))]>;
+  def PRMT_B32rri
+    : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+                (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
+                (ins PrmtMode:$mode),
+                "prmt.b32$mode",
+                [(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>;
+  def PRMT_B32rii
+    : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+                (ins Int32Regs:$a, i32imm:$b, Hexu32imm:$c),
+                (ins PrmtMode:$mode),
+                "prmt.b32$mode",
+                [(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>;
+  def PRMT_B32rir
+    : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+                (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+                (ins PrmtMode:$mode),
+                "prmt.b32$mode",
+                [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>;
+
 }
 
 
@@ -3306,25 +3312,25 @@ include "NVPTXIntrinsics.td"
 
 def : Pat <
   (i32 (bswap i32:$a)),
-  (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>;
+  (PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>;
 
 def : Pat <
   (v2i16 (bswap v2i16:$a)),
-  (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;
+  (PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>;
 
 def : Pat <
   (i64 (bswap i64:$a)),
   (V2I32toI64
-    (INT_NVVM_PRMT (I64toI32H_Sink $a), (i32 0), (i32 0x0123)),
-    (INT_NVVM_PRMT (I64toI32L_Sink $a), (i32 0), (i32 0x0123)))>,
+    (PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE),
+    (PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>,
   Requires<[hasPTX<71>]>;
 
 // Fall back to the old way if we don't have PTX 7.1.
 def : Pat <
   (i64 (bswap i64:$a)),
   (V2I32toI64
-    (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)),
-    (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>;
+    (PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE),
+    (PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>;
 
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f7b8aca0f77d8..95ffa5a04616a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1028,8 +1028,23 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
 // MISC
 //
 
-def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
-  Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+class PRMT3Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
+    : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c),
+          (PRMT_B32rrr $a, $b, $c, prmt_mode)>;
+
+class PRMT2Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
+    : Pat<(prmt_intrinsic i32:$a, i32:$c),
+          (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>;
+
+def : PRMT3Pat<int_nvvm_prmt,      PrmtNONE>;
+def : PRMT3Pat<int_nvvm_prmt_f4e,  PrmtF4E>;
+def : PRMT3Pat<int_nvvm_prmt_b4e,  PrmtB4E>;
+
+def : PRMT2Pat<int_nvvm_prmt_rc8,  PrmtRC8>;
+def : PRMT2Pat<int_nvvm_prmt_ecl,  PrmtECL>;
+def : PRMT2Pat<int_nvvm_prmt_ecr,  PrmtECR>;
+def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>;
+
 
 def INT_NVVM_NANOSLEEP_I : NVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32 \t$i;",
                              [(int_nvvm_nanosleep imm:$i)]>,
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index 0e16682641edd..0d1d6da4ba2b6 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -33,7 +33,7 @@ define i32 @bswap32(i32 %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [bswap32_param_0];
-; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x123U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = tail call i32 @llvm.bswap.i32(i32 %a)
@@ -48,7 +48,7 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [bswapv2i16_param_0];
-; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 8961;
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x2301U;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
@@ -56,25 +56,35 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
 }
 
 define i64 @bswap64(i64 %a) {
-; CHECK-LABEL: bswap64(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %rd1, [bswap64_param_0];
+; PTX70-LABEL: bswap64(
+; PTX70:       {
+; PTX70-NEXT:    .reg .b32 %r<5>;
+; PTX70-NEXT:    .reg .b64 %rd<3>;
+; PTX70-EMPTY:
+; PTX70-NEXT:  // %bb.0:
+; PTX70-NEXT:    ld.param.b64 %rd1, [bswap64_param_0];
 ; PTX70-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
-; PTX70-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; PTX70-NEXT:    prmt.b32 %r2, %r1, 0, 0x123U;
 ; PTX70-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; PTX70-NEXT:    prmt.b32 %r4, %r3, 0, 291;
+; PTX70-NEXT:    prmt.b32 %r4, %r3, 0, 0x123U;
 ; PTX70-NEXT:    mov.b64 %rd2, {%r4, %r2};
-; PTX71-NEXT:    mov.b64         {%r1, _}, %rd1;
-; PTX71-NEXT:    prmt.b32        %r2, %r1, 0, 291;
-; PTX71-NEXT:    mov.b64         {_, %r3}, %rd1;
-; PTX71-NEXT:    prmt.b32        %r4, %r3, 0, 291;
-; PTX71-NEXT:    mov.b64         %rd2, {%r4, %r2};
-; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
-; CHECK-NEXT:    ret;
+; PTX70-NEXT:    st.param.b64 [func_retval0], %rd2;
+; PTX70-NEXT:    ret;
+;
+; PTX71-LABEL: bswap64(
+; PTX71:       {
+; PTX71-NEXT:    .reg .b32 %r<5>;
+; PTX71-NEXT:    .reg .b64 %rd<3>;
+; PTX71-EMPTY:
+; PTX71-NEXT:  // %bb.0:
+; PTX71-NEXT:    ld.param.b64 %rd1, [bswap64_param_0];
+; PTX71-NEXT:    mov.b64 {%r1, _}, %rd1;
+; PTX71-NEXT:    prmt.b32 %r2, %r1, 0, 0x123U;
+; PTX71-NEXT:    mov.b64 {_, %r3}, %rd1;
+; PTX71-NEXT:    prmt.b32 %r4, %r3, 0, 0x123U;
+; PTX71-NEXT:    mov.b64 %rd2, {%r4, %r2};
+; PTX71-NEXT:    st.param.b64 [func_retval0], %rd2;
+; PTX71-NEXT:    ret;
   %b = tail call i64 @llvm.bswap.i64(i64 %a)
   ret i64 %b
 }
diff --git a/llvm/test/CodeGen/NVPTX/prmt.ll b/llvm/test/CodeGen/NVPTX/prmt.ll
new file mode 100644
index 0000000000000..b0ad337a1977f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/prmt.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -verify-machineinstrs | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_prmt_basic(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_prmt_basic(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_basic_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_basic_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_prmt_basic_param_2];
+; CHECK-NEXT:    prmt.b32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+    %val = call i32 @llvm.nvvm.prmt(i32 %a, i32 %b, i32 %c)
+    ret i32 %val
+}
+
+define i32 @test_prmt_f4e(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_prmt_f4e(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_f4e_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_f4e_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_prmt_f4e_param_2];
+; CHECK-NEXT:    prmt.b32.f4e %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.prmt.f4e(i32 %a, i32 %b, i32 %c)
+  ret i32 %val
+}
+
+define i32 @test_prmt_b4e(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_prmt_b4e(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_b4e_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_b4e_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_prmt_b4e_param_2];
+; CHECK-NEXT:    prmt.b32.b4e %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.prmt.b4e(i32 %a, i32 %b, i32 %c)
+  ret i32 %val
+}
+
+define i32 @test_prmt_rc8(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_rc8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_rc8_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_rc8_param_1];
+; CHECK-NEXT:    prmt.b32.rc8 %r3, %r1, 0, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.prmt.rc8(i32 %a, i32 %b)
+  ret i32 %val
+}
+
+define i32 @test_prmt_ecl(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_ecl(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_ecl_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_ecl_param_1];
+; CHECK-NEXT:    prmt.b32.ecl %r3, %r1, 0, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.prmt.ecl(i32 %a, i32 %b)
+  ret i32 %val
+}
+
+define i32 @test_prmt_ecr(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_ecr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_ecr_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_ecr_param_1];
+; CHECK-NEXT:    prmt.b32.ecr %r3, %r1, 0, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.prmt.ecr(i32 %a, i32 %b)
+  ret i32 %val
+}
+
+define i32 @test_prmt_rc16(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_rc16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_rc16_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_rc16_param_1];
+; CHECK-NEXT:    prmt.b32.rc16 %r3, %r1, 0, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %val = call i32 @llvm.nvvm.prmt.rc16(i32 %a, i32 %b)
+  ret i32 %val
+}