[llvm] 4f70917 - [NVPTX] Add intrinsics for the bmsk instruction (#139299)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 9 13:03:36 PDT 2025
Author: Alex MacLean
Date: 2025-05-09T13:03:33-07:00
New Revision: 4f70917d6c5ef8132d46854bdd419be37973de6c
URL: https://github.com/llvm/llvm-project/commit/4f70917d6c5ef8132d46854bdd419be37973de6c
DIFF: https://github.com/llvm/llvm-project/commit/4f70917d6c5ef8132d46854bdd419be37973de6c.diff
LOG: [NVPTX] Add intrinsics for the bmsk instruction (#139299)
Added:
llvm/test/CodeGen/NVPTX/bmsk.ll
Modified:
llvm/docs/NVPTXUsage.rst
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Removed:
################################################################################
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
zero-extended in the case of the '``zext``' variants, or sign-extended in the
case of the '``sext``' variants. If N is 0, the result is 0.
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+ declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
TMA family of Intrinsics
------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;
+
+//
+// BMSK - bit mask: (i32 %a, i32 %b) -> i32, memory-free and speculatable
+//
+ foreach mode = ["wrap", "clamp"] in // defines int_nvvm_bmsk_wrap and int_nvvm_bmsk_clamp
+ def int_nvvm_bmsk_ # mode :
+ DefaultAttrsIntrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
//
// Convert
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
}
}
+//
+// BMSK - ties "bmsk.<mode>.b32" to int_nvvm_bmsk_<mode> for mode in {wrap, clamp}
+//
+
+foreach mode = ["wrap", "clamp"] in {
+ defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode); // look up the intrinsic record by name
+ defm BMSK_ # mode
+ : I3Inst<"bmsk." # mode # ".b32",
+ intrin, I32RT, commutative = false, // operand order matters: %a and %b are not interchangeable
+ requires = [hasSM<70>, hasPTX<76>]>;
+}
+
//
// Convert
//
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) { ; wrap mode, both operands loaded from function parameters
+; CHECK-LABEL: bmsk_wrap(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT: bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) { ; clamp mode, both operands loaded from function parameters
+; CHECK-LABEL: bmsk_clamp(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT: bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() { ; both operands are constants; expected PTX moves 5 into a register and keeps 6 as an immediate
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.b32 %r1, 5;
+; CHECK-NEXT: bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+ ret i32 %c
+}
+
+define i32 @bmsk_clamp_ir(i32 %a) { ; NOTE(review): operands are (register %a, immediate 7) - confirm the "ir" suffix is intended
+; CHECK-LABEL: bmsk_clamp_ir(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
+; CHECK-NEXT: bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+ ret i32 %c
+}
+
+define i32 @bmsk_wrap_ri(i32 %a) { ; NOTE(review): operands are (immediate 5, register %a) - confirm the "ri" suffix is intended
+; CHECK-LABEL: bmsk_wrap_ri(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
+; CHECK-NEXT: bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+ ret i32 %c
+}
More information about the llvm-commits
mailing list