[llvm] [NVPTX] Auto-Upgrade some nvvm.annotations to attributes (PR #119261)

Han-Chung Wang via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 07:45:21 PST 2025


hanhanW wrote:

Hi, we ran into a failure in a downstream project (i.e., IREE) with this change. I'm not an expert in this area, but I'd like to understand how to fix our problem properly. Without the patch, we are generating something like:

```
.visible .entry add_dispatch(
	.param .u64 .ptr .global .align 16 add_dispatch_param_0,
	.param .u64 .ptr .global .align 16 add_dispatch_param_1,
	.param .u64 .ptr .global .align 16 add_dispatch_param_2
)
.maxntid 32, 1, 1
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<2>;
	.reg .f32 	%f<4>;
	.reg .b64 	%rd<8>;

	mov.u32 	%r1, %tid.x;
	bar.sync 	0;
	setp.gt.u32 	%p1, %r1, 15;
	@%p1 bra 	$L__BB0_2;
	ld.param.u64 	%rd4, [add_dispatch_param_0];
	ld.param.u64 	%rd5, [add_dispatch_param_1];
	ld.param.u64 	%rd6, [add_dispatch_param_2];
	mul.wide.u32 	%rd7, %r1, 4;
	add.s64 	%rd1, %rd4, %rd7;
	add.s64 	%rd2, %rd5, %rd7;
	add.s64 	%rd3, %rd6, %rd7;
	ld.global.f32 	%f1, [%rd1];
	ld.global.f32 	%f2, [%rd2];
	add.rn.f32 	%f3, %f1, %f2;
	st.global.f32 	[%rd3], %f3;
$L__BB0_2:
	bar.sync 	0;
	ret;

}
```

With the patch, the `.entry` becomes `.func`, which is probably fine. However, the `.maxntid` directive disappears. According to the NVIDIA PTX documentation, `.maxntid` specifies the maximum number of threads that a thread block can have. Is it valid for the patch to drop this information? Below is the PTX generated with the patch.


```
.visible .func add_dispatch(
	.param .b64 add_dispatch_param_0,
	.param .b64 add_dispatch_param_1,
	.param .b64 add_dispatch_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<2>;
	.reg .f32 	%f<4>;
	.reg .b64 	%rd<8>;

	mov.u32 	%r1, %tid.x;
	bar.sync 	0;
	setp.gt.u32 	%p1, %r1, 15;
	@%p1 bra 	$L__BB0_2;
	ld.param.u64 	%rd4, [add_dispatch_param_0];
	ld.param.u64 	%rd5, [add_dispatch_param_1];
	ld.param.u64 	%rd6, [add_dispatch_param_2];
	mul.wide.u32 	%rd7, %r1, 4;
	add.s64 	%rd1, %rd4, %rd7;
	add.s64 	%rd2, %rd5, %rd7;
	add.s64 	%rd3, %rd6, %rd7;
	ld.global.f32 	%f1, [%rd1];
	ld.global.f32 	%f2, [%rd2];
	add.rn.f32 	%f3, %f1, %f2;
	st.global.f32 	[%rd3], %f3;
$L__BB0_2:
	bar.sync 	0;
	ret;

}
```

I can provide more artifacts if that would help. Thanks in advance!

https://github.com/llvm/llvm-project/pull/119261


More information about the llvm-commits mailing list