[llvm] [NVPTX] Add support for nvvm.flo.[us] intrinsics (PR #114489)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 31 16:24:55 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-nvptx
Author: Alex MacLean (AlexMaclean)
<details>
<summary>Changes</summary>
Add support for '`llvm.nvvm.flo.[su].*`' intrinsics which correspond to a PTX `bfind` instruction. See [PTX ISA 9.7.1.16. Integer Arithmetic Instructions: bfind](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfind)
The '`llvm.nvvm.flo.u`' family of intrinsics identifies the bit position of the leading one, returning either its offset from the most or least significant bit.
The '`llvm.nvvm.flo.s`' family of intrinsics identifies the bit position of the leading non-sign bit, returning either its offset from the most or least significant bit.
---
Full diff: https://github.com/llvm/llvm-project/pull/114489.diff
4 Files Affected:
- (modified) llvm/docs/NVPTXUsage.rst (+53)
- (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+8)
- (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+32)
- (added) llvm/test/CodeGen/NVPTX/flo.ll (+132)
``````````diff
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b7097308f6e890..f225b9e8bd268b 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -378,6 +378,59 @@ right, and the least significant bits are extracted to produce a result that is
the same size as the original arguments. The shift amount is the minimum of the
value of %n and the bit width of the integer type.
+'``llvm.nvvm.flo.u.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 %shiftamt)
+ declare i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 %shiftamt)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.flo.u``' family of intrinsics identifies the bit position of the
+leading one, returning either its offset from the most or least significant bit.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.flo.u``' family of intrinsics returns the bit position of the
+most significant 1. If %shiftamt is true, the result is the shift amount needed
+to left-shift the found bit into the most-significant bit position, otherwise
+the result is the shift amount needed to right-shift the found bit into the
+least-significant bit position. 0xffffffff is returned if no 1 bit is found.
+
+'``llvm.nvvm.flo.s.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 %shiftamt)
+ declare i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 %shiftamt)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.flo.s``' family of intrinsics identifies the bit position of the
+leading non-sign bit, returning either its offset from the most or least
+significant bit.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.flo.s``' family of intrinsics returns the bit position of the
+most significant 0 for negative inputs and the most significant 1 for
+non-negative inputs. If %shiftamt is true, the result is the shift amount needed
+to left-shift the found bit into the most-significant bit position, otherwise
+the result is the shift amount needed to right-shift the found bit into the
+least-significant bit position. 0xffffffff is returned if no non-sign bit is found.
Other Intrinsics
----------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5164f873d00f48..fd0cbed8b25661 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1092,6 +1092,14 @@ let TargetPrefix = "nvvm" in {
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+//
+// FLO - Find Leading One
+//
+ foreach sign = ["s", "u"] in
+ def int_nvvm_flo_ # sign :
+ DefaultAttrsIntrinsic<[llvm_i32_ty],
+ [llvm_anyint_ty, llvm_i1_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
//
// Convert
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f5ac3c4e964363..1f4938d9fcf5a5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6,6 +6,19 @@
//
//===----------------------------------------------------------------------===//
+// Utility class to wrap up information about a register and DAG type for more
+// convenient iteration and parameterization
+class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
+ ValueType Ty = ty;
+ NVPTXRegClass RC = rc;
+ Operand Imm = imm;
+ int Size = ty.Size;
+}
+
+def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
+def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
+
+
def immFloat0 : PatLeaf<(fpimm), [{
float f = (float)N->getValueAPF().convertToFloat();
return (f==0.0f);
@@ -1299,6 +1312,25 @@ def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+//
+// BFIND
+//
+
+foreach t = [I32RT, I64RT] in {
+ foreach sign = ["s", "u"] in {
+ defvar flo_intrin = !cast<Intrinsic>("int_nvvm_flo_" # sign);
+ def BFIND_ # sign # t.Size
+ : NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
+ "bfind." # sign # t.Size # " \t$dst, $src;",
+ [(set (i32 Int32Regs:$dst), (flo_intrin (t.Ty t.RC:$src), 0))]>;
+
+ def BFIND_SHIFTAMT_ # sign # t.Size
+ : NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
+ "bfind.shiftamt." # sign # t.Size # " \t$dst, $src;",
+ [(set (i32 Int32Regs:$dst), (flo_intrin (t.Ty t.RC:$src), -1))]>;
+ }
+}
+
//
// Convert
//
diff --git a/llvm/test/CodeGen/NVPTX/flo.ll b/llvm/test/CodeGen/NVPTX/flo.ll
new file mode 100644
index 00000000000000..bc7f765e40ab43
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/flo.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @flo_1(i32 %a) {
+; CHECK-LABEL: flo_1(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [flo_1_param_0];
+; CHECK-NEXT: bfind.s32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 false)
+ ret i32 %r
+}
+
+
+define i32 @flo_2(i32 %a) {
+; CHECK-LABEL: flo_2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [flo_2_param_0];
+; CHECK-NEXT: bfind.shiftamt.s32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.s.i32(i32 %a, i1 true)
+ ret i32 %r
+}
+
+define i32 @flo_3(i32 %a) {
+; CHECK-LABEL: flo_3(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [flo_3_param_0];
+; CHECK-NEXT: bfind.u32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 false)
+ ret i32 %r
+}
+
+
+define i32 @flo_4(i32 %a) {
+; CHECK-LABEL: flo_4(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [flo_4_param_0];
+; CHECK-NEXT: bfind.shiftamt.u32 %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.u.i32(i32 %a, i1 true)
+ ret i32 %r
+}
+
+
+
+define i32 @flo_5(i64 %a) {
+; CHECK-LABEL: flo_5(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [flo_5_param_0];
+; CHECK-NEXT: bfind.s64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 false)
+ ret i32 %r
+}
+
+
+define i32 @flo_6(i64 %a) {
+; CHECK-LABEL: flo_6(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [flo_6_param_0];
+; CHECK-NEXT: bfind.shiftamt.s64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.s.i64(i64 %a, i1 true)
+ ret i32 %r
+}
+
+define i32 @flo_7(i64 %a) {
+; CHECK-LABEL: flo_7(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [flo_7_param_0];
+; CHECK-NEXT: bfind.u64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 false)
+ ret i32 %r
+}
+
+
+define i32 @flo_8(i64 %a) {
+; CHECK-LABEL: flo_8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [flo_8_param_0];
+; CHECK-NEXT: bfind.shiftamt.u64 %r1, %rd1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %r = call i32 @llvm.nvvm.flo.u.i64(i64 %a, i1 true)
+ ret i32 %r
+}
+
+declare i32 @llvm.nvvm.flo.s.i32(i32, i1)
+declare i32 @llvm.nvvm.flo.u.i32(i32, i1)
+declare i32 @llvm.nvvm.flo.s.i64(i64, i1)
+declare i32 @llvm.nvvm.flo.u.i64(i64, i1)
``````````
</details>
https://github.com/llvm/llvm-project/pull/114489
More information about the llvm-commits
mailing list