[llvm] [clang] [NVPTX] Add 'activemask' builtin and intrinsic support (PR #79768)

Joseph Huber via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 29 11:11:06 PST 2024


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/79768

>From 2c7049defef3b62de7017640948cccfb07ff756c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Sun, 28 Jan 2024 14:57:05 -0600
Subject: [PATCH 1/2] [NVPTX] Add 'activemask' builtin and intrinsic support

Summary:
This patch adds support for getting the 'activemask' instruction's value
without needing to use inline assembly. See the relevant PTX reference
for details.

https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
---
 clang/include/clang/Basic/BuiltinsNVPTX.def |  8 ++++-
 clang/test/CodeGen/builtins-nvptx.c         | 16 ++++++---
 llvm/include/llvm/IR/IntrinsicsNVVM.td      |  8 +++++
 llvm/lib/Target/NVPTX/NVPTX.td              |  4 +--
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    |  6 ++++
 llvm/test/CodeGen/NVPTX/activemask.ll       | 38 +++++++++++++++++++++
 6 files changed, 73 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/activemask.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 0f2e8260143be78..506288547a15822 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -44,6 +44,7 @@
 #pragma push_macro("PTX42")
 #pragma push_macro("PTX60")
 #pragma push_macro("PTX61")
+#pragma push_macro("PTX62")
 #pragma push_macro("PTX63")
 #pragma push_macro("PTX64")
 #pragma push_macro("PTX65")
@@ -76,7 +77,8 @@
 #define PTX65 "ptx65|" PTX70
 #define PTX64 "ptx64|" PTX65
 #define PTX63 "ptx63|" PTX64
-#define PTX61 "ptx61|" PTX63
+#define PTX62 "ptx62|" PTX63
+#define PTX61 "ptx61|" PTX62
 #define PTX60 "ptx60|" PTX61
 #define PTX42 "ptx42|" PTX60
 
@@ -632,6 +634,9 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60)
 TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60)
 TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60)
 
+// Mask
+TARGET_BUILTIN(__nvvm_activemask, "i", "n", PTX62)
+
 // Match
 TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60))
 TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60))
@@ -1065,6 +1070,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
+#pragma pop_macro("PTX62")
 #pragma pop_macro("PTX63")
 #pragma pop_macro("PTX64")
 #pragma pop_macro("PTX65")
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 353f3ebb608c2b1..a2e73eb1d268bd1 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -5,16 +5,16 @@
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
-// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
 // RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
@@ -165,6 +165,14 @@ __device__ void sync() {
 
 }
 
+__device__ void activemask() {
+
+// CHECK: call i32 @llvm.nvvm.activemask()
+
+  __nvvm_activemask();
+
+}
+
 
 // NVVM intrinsics
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5a5ba2592e1467e..0640fb1f74aa5eb 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4599,6 +4599,14 @@ def int_nvvm_vote_ballot_sync :
             [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.ballot.sync">,
   ClangBuiltin<"__nvvm_vote_ballot_sync">;
 
+//
+// ACTIVEMASK
+//
+def int_nvvm_activemask :
+  Intrinsic<[llvm_i32_ty], [],
+            [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.activemask">,
+  ClangBuiltin<"__nvvm_activemask">;
+
 //
 // MATCH.SYNC
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index f2a4ce381b40b48..a2233d3882b236d 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -40,7 +40,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
 
 def SM90a: FeatureSM<"90a", 901>;
 
-foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65,
+foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
   def PTX#version: FeaturePTX<version>;
 
@@ -65,7 +65,7 @@ def : Proc<"sm_61", [SM61, PTX50]>;
 def : Proc<"sm_62", [SM62, PTX50]>;
 def : Proc<"sm_70", [SM70, PTX60]>;
 def : Proc<"sm_72", [SM72, PTX61]>;
-def : Proc<"sm_75", [SM75, PTX63]>;
+def : Proc<"sm_75", [SM75, PTX62, PTX63]>;
 def : Proc<"sm_80", [SM80, PTX70]>;
 def : Proc<"sm_86", [SM86, PTX71]>;
 def : Proc<"sm_87", [SM87, PTX74]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 33f1e4a43e072af..2df931597616566 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -263,6 +263,12 @@ multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntO
            Requires<[hasPTX<60>, hasSM<70>]>;
 }
 
+// activemask.b32
+def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins),
+                    "activemask.b32 \t$dest;", 
+                    [(set Int32Regs:$dest, (int_nvvm_activemask))]>,
+                 Requires<[hasPTX<62>, hasSM<30>]>;
+
 defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<Int32Regs, "b32", int_nvvm_match_any_sync_i32,
                                         i32imm>;
 defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_sync_i64,
diff --git a/llvm/test/CodeGen/NVPTX/activemask.ll b/llvm/test/CodeGen/NVPTX/activemask.ll
new file mode 100644
index 000000000000000..1496b2ebdd44270
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/activemask.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=nvptx64 -O2 -mcpu=sm_52 -mattr=+ptx62 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %}
+
+declare i32 @llvm.nvvm.activemask()
+
+; CHECK-LABEL: activemask(
+;
+;      CHECK: activemask.b32  %[[REG:.+]];
+; CHECK-NEXT: st.param.b32    [func_retval0+0], %[[REG]];
+; CHECK-NEXT: ret;
+define dso_local i32 @activemask() {
+entry:
+  %mask = call i32 @llvm.nvvm.activemask()
+  ret i32 %mask
+}
+
+; CHECK-LABEL: convergent(
+;
+;      CHECK: activemask.b32  %[[REG:.+]];
+;      CHECK: activemask.b32  %[[REG]];
+;      CHECK: .param.b32    [func_retval0+0], %[[REG]];
+; CHECK-NEXT: ret;
+define dso_local i32 @convergent(i1 %cond) {
+entry:
+  br i1 %cond, label %if.else, label %if.then
+
+if.then:
+  %0 = call i32 @llvm.nvvm.activemask()
+  br label %if.end
+
+if.else:
+  %1 = call i32 @llvm.nvvm.activemask()
+  br label %if.end
+
+if.end:
+  %mask = phi i32 [ %0, %if.then ], [ %1, %if.else ]
+  ret i32 %mask
+}

>From 04a1b8423549ece195941d6a92555fb104bf05e8 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 29 Jan 2024 13:10:54 -0600
Subject: [PATCH 2/2] AddHasSideEffects

---
 llvm/include/llvm/IR/IntrinsicsNVVM.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 0640fb1f74aa5eb..542bbf7f9234cb6 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4604,7 +4604,7 @@ def int_nvvm_vote_ballot_sync :
 //
 def int_nvvm_activemask :
   Intrinsic<[llvm_i32_ty], [],
-            [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.activemask">,
+            [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback, IntrHasSideEffects], "llvm.nvvm.activemask">,
   ClangBuiltin<"__nvvm_activemask">;
 
 //



More information about the llvm-commits mailing list