[llvm] [NVPTX] Improve copy avoidance during lowering. (PR #106423)

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 28 16:05:49 PDT 2024


================
@@ -1,166 +1,300 @@
-; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
-; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-%struct.ham = type { [4 x i32] }
-
-; // Verify that load with static offset into parameter is done directly.
-; CHECK-LABEL: .visible .entry static_offset
-; CHECK-NOT:   .local
-; CHECK64: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64         %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64         %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-;
-; CHECK32: ld.param.u32    [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32         %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32         %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-;
-; CHECK: ld.param.u32    [[value:%r[0-9]+]], [%[[param_addr1]]+12];
-; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
-  %tmp = icmp eq i32 %arg2, 3
-  br i1 %tmp, label %bb3, label %bb6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple nvptx -mcpu=sm_70 -nvptx-lower-args -S | FileCheck %s --check-prefixes=CHECK,CHECK32
+source_filename = "<stdin>"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0
 
-bb3:                                              ; preds = %bb
-  %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3
-  %tmp5 = load i32, ptr %tmp4, align 4
-  store i32 %tmp5, ptr %arg, align 4
-  br label %bb6
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0
 
-bb6:                                              ; preds = %bb3, %bb
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i = load i32, ptr %s, align 4
+  store i32 %i, ptr %out, align 4
   ret void
 }
 
-; // Verify that load with dynamic offset into parameter is also done directly.
-; CHECK-LABEL: .visible .entry dynamic_offset
-; CHECK-NOT:   .local
-; CHECK64: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64         %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64         %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64         %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
 ;
-; CHECK32: ld.param.u32    [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32         %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32         %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32         %[[param_w_offset:r[0-9]+]], %[[param_addr1]],
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %i = load i32, ptr %b, align 4
+  store i32 %i, ptr %out, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep_asc(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
 ;
-; CHECK: ld.param.u32    [[value:%r[0-9]+]], [%[[param_w_offset]]];
-; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %asc = addrspacecast ptr %b to ptr addrspace(101)
+  %i = load i32, ptr addrspace(101) %asc, align 4
+  store i32 %i, ptr %out, align 4
+  ret void
+}
 
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
-  %tmp = sext i32 %arg2 to i64
-  %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
-  %tmp4 = load i32, ptr %tmp3, align 4
-  store i32 %tmp4, ptr %arg, align 4
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep_asc0(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; CHECK-NEXT:    [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %asc = addrspacecast ptr %b to ptr addrspace(101)
+  %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
+  %i = load i32, ptr %asc0, align 4
+  store i32 %i, ptr %out, align 4
   ret void
 }
 
-; Same as above, but with a bitcast present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64    [[out:%rd[0-9]+]], [gep_bitcast_param_0]
-; CHECK64-DAG: mov.b64         {{%rd[0-9]+}}, gep_bitcast_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr(
+; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]]) #[[ATTR0]]
+; CHECK-NEXT:    ret void
 ;
-; CHECK32-DAG: ld.param.u32    [[out:%r[0-9]+]], [gep_bitcast_param_0]
-; CHECK32-DAG: mov.b32         {{%r[0-9]+}}, gep_bitcast_param_1
+entry:
+  call void @_Z6escapePv(ptr noundef nonnull %s) #0
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_gep(
+; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR0]]
+; CHECK-NEXT:    ret void
 ;
-; CHECK-DAG: ld.param.u32    {{%r[0-9]+}}, [gep_bitcast_param_2]
-; CHECK64:     ld.param.u8     [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64:     st.global.u8    [{{%rd[0-9]+}}], [[value]];
-; CHECK32:     ld.param.u8     [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32:     st.global.u8    [{{%r[0-9]+}}], [[value]];
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  call void @_Z6escapePv(ptr noundef nonnull %b) #0
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; CHECK-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
+; CHECK-NEXT:    ret void
 ;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast(ptr nocapture %out,  ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
-bb:
-  %n64 = sext i32 %n to i64
-  %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
-  %load = load i8, ptr %gep, align 4
-  store i8 %load, ptr %out, align 4
+entry:
+  %i = ptrtoint ptr %s to i64
+  store i64 %i, ptr %out, align 8
   ret void
 }
 
-; Same as above, but with an ASC(101) present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast_asc
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64    [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK64-DAG: mov.b64         {{%rd[0-9]+}}, gep_bitcast_asc_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_gep_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT:    [[I:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
+; CHECK-NEXT:    ret void
 ;
-; CHECK32-DAG: ld.param.u32    [[out:%r[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK32-DAG: mov.b32         {{%r[0-9]+}}, gep_bitcast_asc_param_1
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %i = ptrtoint ptr %b to i64
+  store i64 %i, ptr %out, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_math_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
----------------
Artem-B wrote:

Reduced to `escape_ptrtoint`

https://github.com/llvm/llvm-project/pull/106423


More information about the llvm-commits mailing list