[llvm] [NVPTX] pull in v2i32 build_vector through v2f32 bitcast (PR #153478)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 14 13:55:24 PDT 2025
================
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM90A %s
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_90a \
+; RUN: %}
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM100 %s
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+
+; Test that v2i32 -> v2f32 conversions don't emit bitwise operations on i64.
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare <2 x i32> @return_i32x2(i32 %0)
+
+; Test with v2i32.
+define ptx_kernel void @store_i32x2(i32 %0, ptr %p) {
+; CHECK-SM90A-LABEL: store_i32x2(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<8>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1];
+; CHECK-SM90A-NEXT: ld.param.b32 %r1, [store_i32x2_param_0];
+; CHECK-SM90A-NEXT: { // callseq 0, 0
+; CHECK-SM90A-NEXT: .param .b32 param0;
+; CHECK-SM90A-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-SM90A-NEXT: st.param.b32 [param0], %r1;
+; CHECK-SM90A-NEXT: call.uni (retval0), return_i32x2, (param0);
+; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0];
+; CHECK-SM90A-NEXT: } // callseq 0
+; CHECK-SM90A-NEXT: add.rn.f32 %r6, %r3, %r3;
+; CHECK-SM90A-NEXT: add.rn.f32 %r7, %r2, %r2;
+; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r7, %r6};
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: store_i32x2(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<6>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<4>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1];
+; CHECK-SM100-NEXT: ld.param.b32 %r1, [store_i32x2_param_0];
+; CHECK-SM100-NEXT: { // callseq 0, 0
+; CHECK-SM100-NEXT: .param .b32 param0;
+; CHECK-SM100-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-SM100-NEXT: st.param.b32 [param0], %r1;
+; CHECK-SM100-NEXT: call.uni (retval0), return_i32x2, (param0);
+; CHECK-SM100-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0];
+; CHECK-SM100-NEXT: } // callseq 0
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r2, %r3};
+; CHECK-SM100-NEXT: add.rn.f32x2 %rd3, %rd2, %rd2;
+; CHECK-SM100-NEXT: st.b64 [%rd1], %rd3;
+; CHECK-SM100-NEXT: ret;
+ %v = call <2 x i32> @return_i32x2(i32 %0)
+ %v.f32x2 = bitcast <2 x i32> %v to <2 x float>
+ %res = fadd <2 x float> %v.f32x2, %v.f32x2
+ store <2 x float> %res, ptr %p, align 8
+ ret void
+}
+
+; Test with inline ASM returning { <1 x float>, <1 x float> }, which decays to
+; v2i32.
+define ptx_kernel void @inlineasm(ptr %p) {
+; CHECK-SM90A-LABEL: inlineasm(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [inlineasm_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r3, 0;
+; CHECK-SM90A-NEXT: mov.b32 %r4, %r3;
+; CHECK-SM90A-NEXT: mov.b32 %r2, %r4;
+; CHECK-SM90A-NEXT: mov.b32 %r1, %r3;
----------------
Artem-B wrote:
I suspect that it's these moves that are tripping up ptxas in our performance regression case. It probably manifests only in larger functions. I'm still working on a reduced reproducer.
https://github.com/llvm/llvm-project/pull/153478
More information about the llvm-commits
mailing list