[llvm] [NVPTX] Add intrinsic support for specialized prmt variants (PR #140951)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 12:02:21 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/140951
>From 43c4c905c796844cc75f534a920f237f18926dbd Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 14 May 2025 22:01:16 +0000
Subject: [PATCH] [NVPTX] Add support for specialized prmt variants
---
llvm/docs/NVPTXUsage.rst | 120 +++++++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsNVVM.td | 20 +++-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 56 ++++++-----
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 19 +++-
llvm/test/CodeGen/NVPTX/bswap.ll | 46 +++++----
llvm/test/CodeGen/NVPTX/prmt.ll | 113 +++++++++++++++++++++
6 files changed, 326 insertions(+), 48 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/prmt.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 07750579f5a5e..60cc055089a01 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -661,6 +661,126 @@ all bits set to 0 except for %b bits starting at bit position %a. For the
'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
which in practice is equivalent to using them as is.
+'``llvm.nvvm.prmt``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.prmt(i32 %a, i32 %b, i32 %c)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.prmt``' intrinsic constructs a permutation of the bytes of the
+first two operands, selecting based on the third operand.
+
+Semantics:
+""""""""""
+
+The bytes in the first two source operands are numbered from 0 to 7:
+{%b, %a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each byte in the target
+register, a 4-bit selection value is defined.
+
+The 3 lsbs of the selection value specify which of the 8 source bytes should be
+moved into the target position. The msb defines if the byte value should be
+copied, or if the sign (msb of the byte) should be replicated over all 8 bits
+of the target position (sign extend of the byte value); msb=0 means copy the
+literal value; msb=1 means replicate the sign.
+
+These 4-bit selection values are pulled from the lower 16 bits of the third
+operand, with the least significant selection value corresponding to the least
+significant byte of the destination.
+
+
+'``llvm.nvvm.prmt.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.prmt.f4e(i32 %a, i32 %b, i32 %c)
+ declare i32 @llvm.nvvm.prmt.b4e(i32 %a, i32 %b, i32 %c)
+
+ declare i32 @llvm.nvvm.prmt.rc8(i32 %a, i32 %c)
+ declare i32 @llvm.nvvm.prmt.ecl(i32 %a, i32 %c)
+ declare i32 @llvm.nvvm.prmt.ecr(i32 %a, i32 %c)
+ declare i32 @llvm.nvvm.prmt.rc16(i32 %a, i32 %c)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.prmt.*``' family of intrinsics constructs a permutation of the
+bytes of the first one or two operands, selecting based on the 2 least
+significant bits of the final operand.
+
+Semantics:
+""""""""""
+
+As with the generic '``llvm.nvvm.prmt``' intrinsic, the bytes in the first one
+or two source operands are numbered. The first source operand (%a) is numbered
+{b3, b2, b1, b0}. For the '``f4e``' and '``b4e``' variants, which take two
+source operands, the second source operand (%b) is numbered {b7, b6, b5, b4}.
+
+The 2 least significant bits of the final operand select the source bytes of
+the result, listed below from most to least significant destination byte:
+
++------------+---------+--------------+
+| Mode | %c[1:0] | Output |
++------------+---------+--------------+
+| '``f4e``' | 0 | {3, 2, 1, 0} |
+| +---------+--------------+
+| | 1 | {4, 3, 2, 1} |
+| +---------+--------------+
+| | 2 | {5, 4, 3, 2} |
+| +---------+--------------+
+| | 3 | {6, 5, 4, 3} |
++------------+---------+--------------+
+| '``b4e``' | 0 | {5, 6, 7, 0} |
+| +---------+--------------+
+| | 1 | {6, 7, 0, 1} |
+| +---------+--------------+
+| | 2 | {7, 0, 1, 2} |
+| +---------+--------------+
+| | 3 | {0, 1, 2, 3} |
++------------+---------+--------------+
+| '``rc8``' | 0 | {0, 0, 0, 0} |
+| +---------+--------------+
+| | 1 | {1, 1, 1, 1} |
+| +---------+--------------+
+| | 2 | {2, 2, 2, 2} |
+| +---------+--------------+
+| | 3 | {3, 3, 3, 3} |
++------------+---------+--------------+
+| '``ecl``' | 0 | {3, 2, 1, 0} |
+| +---------+--------------+
+| | 1 | {3, 2, 1, 1} |
+| +---------+--------------+
+| | 2 | {3, 2, 2, 2} |
+| +---------+--------------+
+| | 3 | {3, 3, 3, 3} |
++------------+---------+--------------+
+| '``ecr``' | 0 | {0, 0, 0, 0} |
+| +---------+--------------+
+| | 1 | {1, 1, 1, 0} |
+| +---------+--------------+
+| | 2 | {2, 2, 1, 0} |
+| +---------+--------------+
+| | 3 | {3, 2, 1, 0} |
++------------+---------+--------------+
+| '``rc16``' | 0 | {1, 0, 1, 0} |
+| +---------+--------------+
+| | 1 | {3, 2, 3, 2} |
+| +---------+--------------+
+| | 2 | {1, 0, 1, 0} |
+| +---------+--------------+
+| | 3 | {3, 2, 3, 2} |
++------------+---------+--------------+
+
TMA family of Intrinsics
------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index aaa24fa43252f..95096672baf74 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -745,9 +745,23 @@ class NVVMBuiltin :
}
let TargetPrefix = "nvvm" in {
- def int_nvvm_prmt : NVVMBuiltin,
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrSpeculatable]>;
+
+ // PRMT - permute
+
+ let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
+ def int_nvvm_prmt : NVVMBuiltin,
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+
+ foreach mode = ["f4e", "b4e"] in
+ def int_nvvm_prmt_ # mode :
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+
+ // Note: these variants also have 2 source operands but only one will ever
+ // be used so we eliminate the other operand in the IR.
+ foreach mode = ["rc8", "ecl", "ecr", "rc16"] in
+ def int_nvvm_prmt_ # mode :
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>;
+ }
def int_nvvm_nanosleep : NVVMBuiltin,
DefaultAttrsIntrinsic<[], [llvm_i32_ty],
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5234fb0806189..8ff5f58f3a699 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1622,24 +1622,6 @@ def Hexu32imm : Operand<i32> {
let PrintMethod = "printHexu32imm";
}
-multiclass PRMT<ValueType T, RegisterClass RC> {
- def rrr
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
- !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
- [(set T:$d, (prmt T:$a, T:$b, i32:$c, imm:$mode))]>;
- def rri
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, Hexu32imm:$c, PrmtMode:$mode),
- !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
- [(set T:$d, (prmt T:$a, T:$b, imm:$c, imm:$mode))]>;
- def rii
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, i32imm:$b, Hexu32imm:$c, PrmtMode:$mode),
- !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
- [(set T:$d, (prmt T:$a, imm:$b, imm:$c, imm:$mode))]>;
-}
-
let hasSideEffects = false in {
// order is somewhat important here. signed/unsigned variants match
// the same patterns, so the first one wins. Having unsigned byte extraction
@@ -1653,7 +1635,31 @@ let hasSideEffects = false in {
defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
- defm PRMT_B32 : PRMT<i32, Int32Regs>;
+ def PRMT_B32rrr
+ : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt i32:$a, i32:$b, i32:$c, imm:$mode))]>;
+ def PRMT_B32rri
+ : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+ (ins Int32Regs:$a, Int32Regs:$b, Hexu32imm:$c),
+ (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>;
+ def PRMT_B32rii
+ : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+ (ins Int32Regs:$a, i32imm:$b, Hexu32imm:$c),
+ (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>;
+ def PRMT_B32rir
+ : BasicFlagsNVPTXInst<(outs Int32Regs:$d),
+ (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+ (ins PrmtMode:$mode),
+ "prmt.b32$mode",
+ [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>;
+
}
@@ -3306,25 +3312,25 @@ include "NVPTXIntrinsics.td"
def : Pat <
(i32 (bswap i32:$a)),
- (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>;
+ (PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>;
def : Pat <
(v2i16 (bswap v2i16:$a)),
- (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;
+ (PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>;
def : Pat <
(i64 (bswap i64:$a)),
(V2I32toI64
- (INT_NVVM_PRMT (I64toI32H_Sink $a), (i32 0), (i32 0x0123)),
- (INT_NVVM_PRMT (I64toI32L_Sink $a), (i32 0), (i32 0x0123)))>,
+ (PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE),
+ (PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>,
Requires<[hasPTX<71>]>;
// Fall back to the old way if we don't have PTX 7.1.
def : Pat <
(i64 (bswap i64:$a)),
(V2I32toI64
- (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)),
- (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>;
+ (PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE),
+ (PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>;
////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f7b8aca0f77d8..95ffa5a04616a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1028,8 +1028,23 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
// MISC
//
-def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
- Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+class PRMT3Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
+ : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c),
+ (PRMT_B32rrr $a, $b, $c, prmt_mode)>;
+
+class PRMT2Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode>
+ : Pat<(prmt_intrinsic i32:$a, i32:$c),
+ (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>;
+
+def : PRMT3Pat<int_nvvm_prmt, PrmtNONE>;
+def : PRMT3Pat<int_nvvm_prmt_f4e, PrmtF4E>;
+def : PRMT3Pat<int_nvvm_prmt_b4e, PrmtB4E>;
+
+def : PRMT2Pat<int_nvvm_prmt_rc8, PrmtRC8>;
+def : PRMT2Pat<int_nvvm_prmt_ecl, PrmtECL>;
+def : PRMT2Pat<int_nvvm_prmt_ecr, PrmtECR>;
+def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>;
+
def INT_NVVM_NANOSLEEP_I : NVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32 \t$i;",
[(int_nvvm_nanosleep imm:$i)]>,
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index 0e16682641edd..0d1d6da4ba2b6 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -33,7 +33,7 @@ define i32 @bswap32(i32 %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [bswap32_param_0];
-; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%b = tail call i32 @llvm.bswap.i32(i32 %a)
@@ -48,7 +48,7 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [bswapv2i16_param_0];
-; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 8961;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x2301U;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
@@ -56,25 +56,35 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
}
define i64 @bswap64(i64 %a) {
-; CHECK-LABEL: bswap64(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
+; PTX70-LABEL: bswap64(
+; PTX70: {
+; PTX70-NEXT: .reg .b32 %r<5>;
+; PTX70-NEXT: .reg .b64 %rd<3>;
+; PTX70-EMPTY:
+; PTX70-NEXT: // %bb.0:
+; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
-; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 291;
+; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 291;
+; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
-; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
-; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 291;
-; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
-; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 291;
-; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
-; CHECK-NEXT: ret;
+; PTX70-NEXT: st.param.b64 [func_retval0], %rd2;
+; PTX70-NEXT: ret;
+;
+; PTX71-LABEL: bswap64(
+; PTX71: {
+; PTX71-NEXT: .reg .b32 %r<5>;
+; PTX71-NEXT: .reg .b64 %rd<3>;
+; PTX71-EMPTY:
+; PTX71-NEXT: // %bb.0:
+; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
+; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
+; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
+; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
+; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
+; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
+; PTX71-NEXT: st.param.b64 [func_retval0], %rd2;
+; PTX71-NEXT: ret;
%b = tail call i64 @llvm.bswap.i64(i64 %a)
ret i64 %b
}
diff --git a/llvm/test/CodeGen/NVPTX/prmt.ll b/llvm/test/CodeGen/NVPTX/prmt.ll
new file mode 100644
index 0000000000000..b0ad337a1977f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/prmt.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -verify-machineinstrs | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_prmt_basic(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_prmt_basic(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_basic_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_basic_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [test_prmt_basic_param_2];
+; CHECK-NEXT: prmt.b32 %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.prmt(i32 %a, i32 %b, i32 %c)
+ ret i32 %val
+}
+
+define i32 @test_prmt_f4e(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_prmt_f4e(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_f4e_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_f4e_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [test_prmt_f4e_param_2];
+; CHECK-NEXT: prmt.b32.f4e %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.prmt.f4e(i32 %a, i32 %b, i32 %c)
+ ret i32 %val
+}
+
+define i32 @test_prmt_b4e(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_prmt_b4e(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_b4e_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_b4e_param_1];
+; CHECK-NEXT: ld.param.b32 %r3, [test_prmt_b4e_param_2];
+; CHECK-NEXT: prmt.b32.b4e %r4, %r1, %r2, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.prmt.b4e(i32 %a, i32 %b, i32 %c)
+ ret i32 %val
+}
+
+define i32 @test_prmt_rc8(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_rc8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_rc8_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_rc8_param_1];
+; CHECK-NEXT: prmt.b32.rc8 %r3, %r1, 0, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.prmt.rc8(i32 %a, i32 %b)
+ ret i32 %val
+}
+
+define i32 @test_prmt_ecl(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_ecl(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_ecl_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_ecl_param_1];
+; CHECK-NEXT: prmt.b32.ecl %r3, %r1, 0, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.prmt.ecl(i32 %a, i32 %b)
+ ret i32 %val
+}
+
+define i32 @test_prmt_ecr(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_ecr(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_ecr_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_ecr_param_1];
+; CHECK-NEXT: prmt.b32.ecr %r3, %r1, 0, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.prmt.ecr(i32 %a, i32 %b)
+ ret i32 %val
+}
+
+define i32 @test_prmt_rc16(i32 %a, i32 %b) {
+; CHECK-LABEL: test_prmt_rc16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_prmt_rc16_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [test_prmt_rc16_param_1];
+; CHECK-NEXT: prmt.b32.rc16 %r3, %r1, 0, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %val = call i32 @llvm.nvvm.prmt.rc16(i32 %a, i32 %b)
+ ret i32 %val
+}
More information about the llvm-commits
mailing list