[llvm] [NVPTX] Add intrinsics for the szext instruction (PR #139126)

Thu May 8 16:46:03 PDT 2025

================
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+; Checks that a call to @llvm.nvvm.zext.wrap with both i32 operands coming from
+; function parameters is emitted as a single szext.wrap.u32 instruction.
+define i32 @szext_wrap_u32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_wrap_u32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [szext_wrap_u32_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [szext_wrap_u32_param_1];
+; CHECK-NEXT:    szext.wrap.u32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+; Checks that a call to @llvm.nvvm.zext.clamp with both i32 operands coming from
+; function parameters is emitted as a single szext.clamp.u32 instruction.
+define i32 @szext_clamp_u32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_clamp_u32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [szext_clamp_u32_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [szext_clamp_u32_param_1];
+; CHECK-NEXT:    szext.clamp.u32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+; Checks that a call to @llvm.nvvm.sext.wrap with both i32 operands coming from
+; function parameters is emitted as a single szext.wrap.s32 instruction.
+define i32 @szext_wrap_s32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_wrap_s32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [szext_wrap_s32_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [szext_wrap_s32_param_1];
+; CHECK-NEXT:    szext.wrap.s32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+; Checks that a call to @llvm.nvvm.sext.clamp with both i32 operands coming from
+; function parameters is emitted as a single szext.clamp.s32 instruction.
+define i32 @szext_clamp_s32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_clamp_s32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [szext_clamp_s32_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [szext_clamp_s32_param_1];
+; CHECK-NEXT:    szext.clamp.s32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @szext_clamp_s32_ii() {
+; CHECK-LABEL: szext_clamp_s32_ii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 3;
+; CHECK-NEXT:    szext.clamp.s32 %r2, %r1, 4;
----------------
Artem-B wrote:

Agreed, in general, for pure functions that the compiler knows and can reason about, which generally means that it can constant-fold them.

For intrinsics it's less clear-cut, as they tend to be special cases the compiler can't do much about. Those constant parameters may or may not allow the intrinsic to be const-folded, and even when it can be, the folding would still have to be implemented as a special case. E.g. for szext instructions, we could get LLVM to calculate the result of the operation if both constants are known, but we currently don't.

An extra move will be dealt with by ptxas, so I'm OK without `ii` lowering.

https://github.com/llvm/llvm-project/pull/139126