[llvm] [NVPTX] Improve copy avoidance during lowering. (PR #106423)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 13:58:36 PDT 2024
================
@@ -49,41 +71,85 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
-; PTX-NOT: .local
-; PTX: cvta.param.{{.*}}
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0;
+; PTX-NEXT: mov.u64 %rd2, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: { // callseq 0, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd3;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r1, [retval0+0];
+; PTX-NEXT: } // callseq 0
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
+; OPT-NEXT: ret void
%call = call i32 @escape(ptr %input)
ret void
}
define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
; PTX-LABEL: multiple_grid_const_escape(
-; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
-; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
-; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]];
-; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]];
-; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]];
-; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]];
-; PTX: {
-; PTX: st.param.b64 [param0+0], [[RD5]];
-; PTX: st.param.b64 [param2+0], [[RD6]];
-;
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot3[4];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.u64 %SPL, __local_depot3;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0;
+; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2;
+; PTX-NEXT: mov.u64 %rd3, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
+; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: mov.u64 %rd5, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT: add.u64 %rd7, %SP, 0;
+; PTX-NEXT: add.u64 %rd8, %SPL, 0;
+; PTX-NEXT: st.local.u32 [%rd8], %r1;
+; PTX-NEXT: { // callseq 1, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd6;
+; PTX-NEXT: .param .b64 param1;
+; PTX-NEXT: st.param.b64 [param1+0], %rd7;
+; PTX-NEXT: .param .b64 param2;
+; PTX-NEXT: st.param.b64 [param2+0], %rd4;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape3,
+; PTX-NEXT: (
+; PTX-NEXT: param0,
+; PTX-NEXT: param1,
+; PTX-NEXT: param2
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r2, [retval0+0];
+; PTX-NEXT: } // callseq 1
+; PTX-NEXT: ret;
; OPT-LABEL: define void @multiple_grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
-; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NOT: alloca %struct.s
-; OPT: [[A_ADDR:%.*]] = alloca i32, align 4
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
----------------
Artem-B wrote:
Never mind. This is the same alloca that's in the original IR. Lowering still works fine.
https://github.com/llvm/llvm-project/pull/106423
More information about the llvm-commits mailing list