[llvm] [NVPTX] expand trunc v2i32->v2i16 (PR #161715)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 2 12:19:12 PDT 2025
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/161715
>From d10c4152c56522ad175663d0ee55a2bd48ab198e Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 2 Oct 2025 10:57:24 -0700
Subject: [PATCH 1/3] [NVPTX] expand trunc v2i32->v2i16
#153478 made v2i32 legal on newer GPUs, but we cannot lower all operations yet.
Expand the `trunc` operation until we implement efficient lowering.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3ac7c2874408b..48e539037dcc7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -638,6 +638,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// No support for these operations with v2f32/v2i32
setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Expand);
+
// Need custom lowering in case the index is dynamic.
if (STI.hasF32x2Instructions())
setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
>From 23da2a5c5b5938aa974755ba269dad45f0e39840 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 2 Oct 2025 11:10:52 -0700
Subject: [PATCH 2/3] added a test case
---
.../test/CodeGen/NVPTX/f32x2-convert-i32x2.ll | 37 +++++++++++++++++++
1 file changed, 37 insertions(+)
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
index 18fb87935d17d..4bfae5c437b85 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
@@ -115,5 +115,42 @@ define ptx_kernel void @inlineasm(ptr %p) {
store <2 x float> %mul, ptr %p, align 8
ret void
}
+
+define ptx_kernel void @trunc_v2i32(<2 x i32> %0) {
+; CHECK-SM90A-LABEL: trunc_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0];
+; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U;
+; CHECK-SM90A-NEXT: mov.b32 %r4, 0;
+; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U;
+; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: trunc_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<7>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<3>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-SM100-NEXT: mov.b32 %r3, 0;
+; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U;
+; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U;
+; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U;
+; CHECK-SM100-NEXT: mov.b64 %rd2, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd2], %r6;
+; CHECK-SM100-NEXT: ret;
+ %2 = trunc <2 x i32> %0 to <2 x i8>
+ %3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i8> %3, ptr null, align 4
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
>From f52e708b355baaf149c8aea9f1dd613fd840f3d4 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 2 Oct 2025 12:18:00 -0700
Subject: [PATCH 3/3] expand *ext operations, too.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +
.../test/CodeGen/NVPTX/f32x2-convert-i32x2.ll | 108 ++++++++++++++++++
2 files changed, 110 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 48e539037dcc7..8c21746c4369e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -640,6 +640,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v2i16, Expand);
+ setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
+ MVT::v2i32, Expand);
// Need custom lowering in case the index is dynamic.
if (STI.hasF32x2Instructions())
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
index 4bfae5c437b85..21ca041f6220a 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
@@ -152,5 +152,113 @@ define ptx_kernel void @trunc_v2i32(<2 x i32> %0) {
store <4 x i8> %3, ptr null, align 4
ret void
}
+
+define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) {
+; CHECK-SM90A-LABEL: zextend_to_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM90A-NEXT: .reg .b32 %r<4>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3;
+; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
+; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2;
+; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
+; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: zextend_to_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM100-NEXT: .reg .b32 %r<5>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2};
+; CHECK-SM100-NEXT: mov.b32 %r4, 0;
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4};
+; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
+; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
+; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
+; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
+; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
+; CHECK-SM100-NEXT: ret;
+ %2 = zext <2 x i8> %0 to <2 x i32>
+ %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %3, ptr null, align 4
+ ret void
+}
+
+define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) {
+; CHECK-SM90A-LABEL: sextend_to_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM90A-NEXT: .reg .b32 %r<6>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5;
+; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
+; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3;
+; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
+; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: sextend_to_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM100-NEXT: .reg .b32 %r<7>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1;
+; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3};
+; CHECK-SM100-NEXT: mov.b32 %r6, 0;
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6};
+; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
+; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
+; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
+; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
+; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
+; CHECK-SM100-NEXT: ret;
+ %2 = sext <2 x i8> %0 to <2 x i32>
+ %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %3, ptr null, align 4
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
More information about the llvm-commits
mailing list