[llvm] [InferAS] Support getAssumedAddrSpace for Arguments for NVPTX (PR #133991)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 3 13:37:02 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/133991
From 7004b4b62aa034f1dd33ed79e000689861230cae Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 3 Apr 2025 15:38:31 +0000
Subject: [PATCH 1/2] pre-commit tests
---
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 924 ++++++++++++------
.../InferAddressSpaces/NVPTX/arguments.ll | 33 +
2 files changed, 644 insertions(+), 313 deletions(-)
create mode 100644 llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index cfe934544eb3a..13522145678a0 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,9 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5
-; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60
-; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70
-; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60
-; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70
; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-copy-byval-args -S | FileCheck %s --check-prefixes=COMMON,COPY
+; RUN: llc < %s -mcpu=sm_60 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_60
+; RUN: llc < %s -mcpu=sm_70 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_70
source_filename = "<stdin>"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@@ -27,25 +29,15 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @read_only(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
-; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @read_only(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
-; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
@@ -57,6 +49,17 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
+; PTX-LABEL: read_only(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: ld.param.u64 %rd1, [read_only_param_0];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [read_only_param_1];
+; PTX-NEXT: st.global.u32 [%rd2], %r1;
+; PTX-NEXT: ret;
entry:
%i = load i32, ptr %s, align 4
store i32 %i, ptr %out, align 4
@@ -65,27 +68,16 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
-; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
-; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
-; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
-; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -98,6 +90,17 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
+; PTX-LABEL: read_only_gep(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_param_0];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_param_1+4];
+; PTX-NEXT: st.global.u32 [%rd2], %r1;
+; PTX-NEXT: ret;
entry:
%b = getelementptr inbounds nuw i8, ptr %s, i64 4
%i = load i32, ptr %b, align 4
@@ -107,27 +110,16 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
-; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
-; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
-; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
-; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -141,6 +133,17 @@ define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeo
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
+; PTX-LABEL: read_only_gep_asc(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc_param_0];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc_param_1+4];
+; PTX-NEXT: st.global.u32 [%rd2], %r1;
+; PTX-NEXT: ret;
entry:
%b = getelementptr inbounds nuw i8, ptr %s, i64 4
%asc = addrspacecast ptr %b to ptr addrspace(101)
@@ -151,35 +154,20 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; SM_60-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; SM_60-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
-; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
-; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; SM_70-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; SM_70-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
-; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
-; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; LOWER-ARGS-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -194,6 +182,17 @@ define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef write
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
+; PTX-LABEL: read_only_gep_asc0(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc0_param_0];
+; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: st.global.u32 [%rd2], %r1;
+; PTX-NEXT: ret;
entry:
%b = getelementptr inbounds nuw i8, ptr %s, i64 4
%asc = addrspacecast ptr %b to ptr addrspace(101)
@@ -205,27 +204,16 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr(
+; LOWER-ARGS-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr(
; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -236,6 +224,33 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]])
; COPY-NEXT: ret void
;
+; PTX-LABEL: escape_ptr(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot4[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: mov.b64 %SPL, __local_depot4;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: add.u64 %rd1, %SP, 0;
+; PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_param_1+4];
+; PTX-NEXT: st.local.u32 [%rd2+4], %r1;
+; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_param_1];
+; PTX-NEXT: st.local.u32 [%rd2], %r2;
+; PTX-NEXT: { // callseq 0, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0], %rd1;
+; PTX-NEXT: call.uni
+; PTX-NEXT: _Z6escapePv,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 0
+; PTX-NEXT: ret;
entry:
call void @_Z6escapePv(ptr noundef nonnull %s) #0
ret void
@@ -243,29 +258,17 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
+; LOWER-ARGS-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; LOWER-ARGS-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -277,6 +280,34 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
; COPY-NEXT: ret void
;
+; PTX-LABEL: escape_ptr_gep(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot5[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: mov.b64 %SPL, __local_depot5;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: add.u64 %rd1, %SP, 0;
+; PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_param_1+4];
+; PTX-NEXT: st.local.u32 [%rd2+4], %r1;
+; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_param_1];
+; PTX-NEXT: st.local.u32 [%rd2], %r2;
+; PTX-NEXT: add.s64 %rd3, %rd1, 4;
+; PTX-NEXT: { // callseq 1, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0], %rd3;
+; PTX-NEXT: call.uni
+; PTX-NEXT: _Z6escapePv,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 1
+; PTX-NEXT: ret;
entry:
%b = getelementptr inbounds nuw i8, ptr %s, i64 4
call void @_Z6escapePv(ptr noundef nonnull %b) #0
@@ -285,27 +316,16 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -316,6 +336,27 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
; COPY-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
; COPY-NEXT: ret void
;
+; PTX-LABEL: escape_ptr_store(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot6[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<5>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: mov.b64 %SPL, __local_depot6;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0];
+; PTX-NEXT: add.u64 %rd2, %SP, 0;
+; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_store_param_1+4];
+; PTX-NEXT: st.local.u32 [%rd3+4], %r1;
+; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_store_param_1];
+; PTX-NEXT: st.local.u32 [%rd3], %r2;
+; PTX-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; PTX-NEXT: st.global.u64 [%rd4], %rd2;
+; PTX-NEXT: ret;
entry:
store ptr %s, ptr %out, align 8
ret void
@@ -323,29 +364,17 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; SM_60-NEXT: store ptr [[B]], ptr [[OUT2]], align 8
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; SM_70-NEXT: store ptr [[B]], ptr [[OUT2]], align 8
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; LOWER-ARGS-NEXT: store ptr [[B]], ptr [[OUT2]], align 8
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -357,6 +386,28 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
; COPY-NEXT: store ptr [[B]], ptr [[OUT]], align 8
; COPY-NEXT: ret void
;
+; PTX-LABEL: escape_ptr_gep_store(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot7[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: mov.b64 %SPL, __local_depot7;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_gep_store_param_0];
+; PTX-NEXT: add.u64 %rd2, %SP, 0;
+; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_store_param_1+4];
+; PTX-NEXT: st.local.u32 [%rd3+4], %r1;
+; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_store_param_1];
+; PTX-NEXT: st.local.u32 [%rd3], %r2;
+; PTX-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; PTX-NEXT: add.s64 %rd5, %rd2, 4;
+; PTX-NEXT: st.global.u64 [%rd4], %rd5;
+; PTX-NEXT: ret;
entry:
%b = getelementptr inbounds nuw i8, ptr %s, i64 4
store ptr %b, ptr %out, align 8
@@ -365,29 +416,17 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
-; SM_60-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
-; SM_70-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; LOWER-ARGS-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -399,6 +438,27 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
; COPY-NEXT: store i64 [[I]], ptr [[OUT]], align 8
; COPY-NEXT: ret void
;
+; PTX-LABEL: escape_ptrtoint(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot8[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<5>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: mov.b64 %SPL, __local_depot8;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0];
+; PTX-NEXT: add.u64 %rd2, %SP, 0;
+; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [escape_ptrtoint_param_1+4];
+; PTX-NEXT: st.local.u32 [%rd3+4], %r1;
+; PTX-NEXT: ld.param.u32 %r2, [escape_ptrtoint_param_1];
+; PTX-NEXT: st.local.u32 [%rd3], %r2;
+; PTX-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; PTX-NEXT: st.global.u64 [%rd4], %rd2;
+; PTX-NEXT: ret;
entry:
%i = ptrtoint ptr %s to i64
store i64 %i, ptr %out, align 8
@@ -407,23 +467,14 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -434,6 +485,46 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true)
; COPY-NEXT: ret void
;
+; PTX-LABEL: memcpy_from_param(
+; PTX: {
+; PTX-NEXT: .reg .b16 %rs<17>;
+; PTX-NEXT: .reg .b64 %rd<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_param_0];
+; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_param_1+15];
+; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1;
+; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_param_1+14];
+; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2;
+; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_param_1+13];
+; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3;
+; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_param_1+12];
+; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4;
+; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_param_1+11];
+; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5;
+; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_param_1+10];
+; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6;
+; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_param_1+9];
+; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7;
+; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_param_1+8];
+; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8;
+; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_param_1+7];
+; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9;
+; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_param_1+6];
+; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10;
+; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_param_1+5];
+; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11;
+; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_param_1+4];
+; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12;
+; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_param_1+3];
+; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13;
+; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_param_1+2];
+; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14;
+; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_param_1+1];
+; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15;
+; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_param_1];
+; PTX-NEXT: st.volatile.u8 [%rd1], %rs16;
+; PTX-NEXT: ret;
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
ret void
@@ -441,23 +532,14 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
-; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
-; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
+; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -468,6 +550,46 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound
; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true)
; COPY-NEXT: ret void
;
+; PTX-LABEL: memcpy_from_param_noalign(
+; PTX: {
+; PTX-NEXT: .reg .b16 %rs<17>;
+; PTX-NEXT: .reg .b64 %rd<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_noalign_param_0];
+; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_noalign_param_1+15];
+; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1;
+; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_noalign_param_1+14];
+; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2;
+; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_noalign_param_1+13];
+; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3;
+; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_noalign_param_1+12];
+; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4;
+; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_noalign_param_1+11];
+; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5;
+; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_noalign_param_1+10];
+; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6;
+; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_noalign_param_1+9];
+; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7;
+; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_noalign_param_1+8];
+; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8;
+; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_noalign_param_1+7];
+; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9;
+; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_noalign_param_1+6];
+; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10;
+; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_noalign_param_1+5];
+; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11;
+; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_noalign_param_1+4];
+; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12;
+; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_noalign_param_1+3];
+; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13;
+; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_noalign_param_1+2];
+; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14;
+; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_noalign_param_1+1];
+; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15;
+; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_noalign_param_1];
+; PTX-NEXT: st.volatile.u8 [%rd1], %rs16;
+; PTX-NEXT: ret;
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
ret void
@@ -475,27 +597,16 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
-; SM_60-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
-; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
-; SM_60-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
-; SM_70-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
-; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
-; SM_70-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
+; LOWER-ARGS-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; LOWER-ARGS-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -506,6 +617,70 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
; COPY-NEXT: ret void
;
+; PTX-LABEL: memcpy_to_param(
+; PTX: {
+; PTX-NEXT: .local .align 8 .b8 __local_depot11[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<48>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %entry
+; PTX-NEXT: mov.b64 %SPL, __local_depot11;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: ld.param.u64 %rd1, [memcpy_to_param_param_0];
+; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [memcpy_to_param_param_1+4];
+; PTX-NEXT: st.local.u32 [%rd3+4], %r1;
+; PTX-NEXT: ld.param.u32 %r2, [memcpy_to_param_param_1];
+; PTX-NEXT: st.local.u32 [%rd3], %r2;
+; PTX-NEXT: ld.volatile.u8 %rd4, [%rd1];
+; PTX-NEXT: ld.volatile.u8 %rd5, [%rd1+1];
+; PTX-NEXT: shl.b64 %rd6, %rd5, 8;
+; PTX-NEXT: or.b64 %rd7, %rd6, %rd4;
+; PTX-NEXT: ld.volatile.u8 %rd8, [%rd1+2];
+; PTX-NEXT: shl.b64 %rd9, %rd8, 16;
+; PTX-NEXT: ld.volatile.u8 %rd10, [%rd1+3];
+; PTX-NEXT: shl.b64 %rd11, %rd10, 24;
+; PTX-NEXT: or.b64 %rd12, %rd11, %rd9;
+; PTX-NEXT: or.b64 %rd13, %rd12, %rd7;
+; PTX-NEXT: ld.volatile.u8 %rd14, [%rd1+4];
+; PTX-NEXT: ld.volatile.u8 %rd15, [%rd1+5];
+; PTX-NEXT: shl.b64 %rd16, %rd15, 8;
+; PTX-NEXT: or.b64 %rd17, %rd16, %rd14;
+; PTX-NEXT: ld.volatile.u8 %rd18, [%rd1+6];
+; PTX-NEXT: shl.b64 %rd19, %rd18, 16;
+; PTX-NEXT: ld.volatile.u8 %rd20, [%rd1+7];
+; PTX-NEXT: shl.b64 %rd21, %rd20, 24;
+; PTX-NEXT: or.b64 %rd22, %rd21, %rd19;
+; PTX-NEXT: or.b64 %rd23, %rd22, %rd17;
+; PTX-NEXT: shl.b64 %rd24, %rd23, 32;
+; PTX-NEXT: or.b64 %rd25, %rd24, %rd13;
+; PTX-NEXT: st.volatile.u64 [%SP], %rd25;
+; PTX-NEXT: ld.volatile.u8 %rd26, [%rd1+8];
+; PTX-NEXT: ld.volatile.u8 %rd27, [%rd1+9];
+; PTX-NEXT: shl.b64 %rd28, %rd27, 8;
+; PTX-NEXT: or.b64 %rd29, %rd28, %rd26;
+; PTX-NEXT: ld.volatile.u8 %rd30, [%rd1+10];
+; PTX-NEXT: shl.b64 %rd31, %rd30, 16;
+; PTX-NEXT: ld.volatile.u8 %rd32, [%rd1+11];
+; PTX-NEXT: shl.b64 %rd33, %rd32, 24;
+; PTX-NEXT: or.b64 %rd34, %rd33, %rd31;
+; PTX-NEXT: or.b64 %rd35, %rd34, %rd29;
+; PTX-NEXT: ld.volatile.u8 %rd36, [%rd1+12];
+; PTX-NEXT: ld.volatile.u8 %rd37, [%rd1+13];
+; PTX-NEXT: shl.b64 %rd38, %rd37, 8;
+; PTX-NEXT: or.b64 %rd39, %rd38, %rd36;
+; PTX-NEXT: ld.volatile.u8 %rd40, [%rd1+14];
+; PTX-NEXT: shl.b64 %rd41, %rd40, 16;
+; PTX-NEXT: ld.volatile.u8 %rd42, [%rd1+15];
+; PTX-NEXT: shl.b64 %rd43, %rd42, 24;
+; PTX-NEXT: or.b64 %rd44, %rd43, %rd41;
+; PTX-NEXT: or.b64 %rd45, %rd44, %rd39;
+; PTX-NEXT: shl.b64 %rd46, %rd45, 32;
+; PTX-NEXT: or.b64 %rd47, %rd46, %rd35;
+; PTX-NEXT: st.volatile.u64 [%SP+8], %rd47;
+; PTX-NEXT: ret;
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
ret void
@@ -513,29 +688,17 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @copy_on_store(
-; SM_60-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[BB:.*:]]
-; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
-; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
-; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
-; SM_60-NEXT: store i32 [[I]], ptr [[S3]], align 4
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @copy_on_store(
-; SM_70-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
-; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
-; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
-; SM_70-NEXT: store i32 [[I]], ptr [[S3]], align 4
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @copy_on_store(
+; LOWER-ARGS-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; LOWER-ARGS-NEXT: [[BB:.*:]]
+; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
+; LOWER-ARGS-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[S3]], align 4
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @copy_on_store(
; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -547,6 +710,12 @@ define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %
; COPY-NEXT: store i32 [[I]], ptr [[S1]], align 4
; COPY-NEXT: ret void
;
+; PTX-LABEL: copy_on_store(
+; PTX: {
+; PTX-EMPTY:
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %bb
+; PTX-NEXT: ret;
bb:
%i = load i32, ptr %in, align 4
store i32 %i, ptr %s, align 4
@@ -598,6 +767,48 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
+; PTX_60-LABEL: test_select(
+; PTX_60: {
+; PTX_60-NEXT: .reg .pred %p<2>;
+; PTX_60-NEXT: .reg .b16 %rs<3>;
+; PTX_60-NEXT: .reg .b32 %r<4>;
+; PTX_60-NEXT: .reg .b64 %rd<3>;
+; PTX_60-EMPTY:
+; PTX_60-NEXT: // %bb.0: // %bb
+; PTX_60-NEXT: ld.param.u8 %rs1, [test_select_param_3];
+; PTX_60-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; PTX_60-NEXT: ld.param.u64 %rd1, [test_select_param_2];
+; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX_60-NEXT: ld.param.u32 %r1, [test_select_param_1];
+; PTX_60-NEXT: ld.param.u32 %r2, [test_select_param_0];
+; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1;
+; PTX_60-NEXT: st.global.u32 [%rd2], %r3;
+; PTX_60-NEXT: ret;
+;
+; PTX_70-LABEL: test_select(
+; PTX_70: {
+; PTX_70-NEXT: .reg .pred %p<2>;
+; PTX_70-NEXT: .reg .b16 %rs<3>;
+; PTX_70-NEXT: .reg .b32 %r<2>;
+; PTX_70-NEXT: .reg .b64 %rd<10>;
+; PTX_70-EMPTY:
+; PTX_70-NEXT: // %bb.0: // %bb
+; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3];
+; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0;
+; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2];
+; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1;
+; PTX_70-NEXT: mov.b64 %rd5, %rd4;
+; PTX_70-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX_70-NEXT: mov.b64 %rd7, %rd1;
+; PTX_70-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX_70-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
+; PTX_70-NEXT: ld.u32 %r1, [%rd9];
+; PTX_70-NEXT: st.global.u32 [%rd3], %r1;
+; PTX_70-NEXT: ret;
bb:
%ptrnew = select i1 %cond, ptr %input1, ptr %input2
%valloaded = load i32, ptr %ptrnew, align 4
@@ -606,35 +817,20 @@ bb:
}
define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; SM_60-LABEL: define ptx_kernel void @test_select_write(
-; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
-; SM_60-NEXT: [[BB:.*:]]
-; SM_60-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr
-; SM_60-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
-; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
-; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
-; SM_60-NEXT: store i32 1, ptr [[PTRNEW]], align 4
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define ptx_kernel void @test_select_write(
-; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
-; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr
-; SM_70-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; SM_70-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
-; SM_70-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; SM_70-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
-; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
-; SM_70-NEXT: store i32 1, ptr [[PTRNEW]], align 4
-; SM_70-NEXT: ret void
+; LOWER-ARGS-LABEL: define ptx_kernel void @test_select_write(
+; LOWER-ARGS-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; LOWER-ARGS-NEXT: [[BB:.*:]]
+; LOWER-ARGS-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; LOWER-ARGS-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr
+; LOWER-ARGS-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
+; LOWER-ARGS-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
+; LOWER-ARGS-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
+; LOWER-ARGS-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
+; LOWER-ARGS-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
+; LOWER-ARGS-NEXT: store i32 1, ptr [[PTRNEW]], align 4
+; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define ptx_kernel void @test_select_write(
; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
@@ -649,6 +845,32 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
; COPY-NEXT: store i32 1, ptr [[PTRNEW]], align 4
; COPY-NEXT: ret void
;
+; PTX-LABEL: test_select_write(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot14[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %bb
+; PTX-NEXT: mov.b64 %SPL, __local_depot14;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: ld.param.u8 %rs1, [test_select_write_param_3];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; PTX-NEXT: ld.param.u32 %r1, [test_select_write_param_1];
+; PTX-NEXT: st.u32 [%SP], %r1;
+; PTX-NEXT: ld.param.u32 %r2, [test_select_write_param_0];
+; PTX-NEXT: st.u32 [%SP+4], %r2;
+; PTX-NEXT: add.u64 %rd2, %SPL, 4;
+; PTX-NEXT: add.u64 %rd4, %SPL, 0;
+; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1;
+; PTX-NEXT: mov.b32 %r3, 1;
+; PTX-NEXT: st.local.u32 [%rd5], %r3;
+; PTX-NEXT: ret;
bb:
%ptrnew = select i1 %cond, ptr %input1, ptr %input2
store i32 1, ptr %ptrnew, align 4
@@ -724,6 +946,53 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; COPY-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; COPY-NEXT: ret void
;
+; PTX_60-LABEL: test_phi(
+; PTX_60: {
+; PTX_60-NEXT: .reg .pred %p<2>;
+; PTX_60-NEXT: .reg .b16 %rs<3>;
+; PTX_60-NEXT: .reg .b32 %r<5>;
+; PTX_60-NEXT: .reg .b64 %rd<3>;
+; PTX_60-EMPTY:
+; PTX_60-NEXT: // %bb.0: // %bb
+; PTX_60-NEXT: ld.param.u8 %rs1, [test_phi_param_3];
+; PTX_60-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; PTX_60-NEXT: ld.param.u64 %rd2, [test_phi_param_2];
+; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2;
+; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_0];
+; PTX_60-NEXT: @%p1 bra $L__BB15_2;
+; PTX_60-NEXT: // %bb.1: // %second
+; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_1+4];
+; PTX_60-NEXT: $L__BB15_2: // %merge
+; PTX_60-NEXT: st.global.u32 [%rd1], %r4;
+; PTX_60-NEXT: ret;
+;
+; PTX_70-LABEL: test_phi(
+; PTX_70: {
+; PTX_70-NEXT: .reg .pred %p<2>;
+; PTX_70-NEXT: .reg .b16 %rs<3>;
+; PTX_70-NEXT: .reg .b32 %r<2>;
+; PTX_70-NEXT: .reg .b64 %rd<12>;
+; PTX_70-EMPTY:
+; PTX_70-NEXT: // %bb.0: // %bb
+; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3];
+; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; PTX_70-NEXT: mov.b64 %rd6, test_phi_param_0;
+; PTX_70-NEXT: ld.param.u64 %rd7, [test_phi_param_2];
+; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd7;
+; PTX_70-NEXT: mov.b64 %rd10, %rd6;
+; PTX_70-NEXT: cvta.param.u64 %rd11, %rd10;
+; PTX_70-NEXT: @%p1 bra $L__BB15_2;
+; PTX_70-NEXT: // %bb.1: // %second
+; PTX_70-NEXT: mov.b64 %rd8, test_phi_param_1;
+; PTX_70-NEXT: mov.b64 %rd9, %rd8;
+; PTX_70-NEXT: cvta.param.u64 %rd2, %rd9;
+; PTX_70-NEXT: add.s64 %rd11, %rd2, 4;
+; PTX_70-NEXT: $L__BB15_2: // %merge
+; PTX_70-NEXT: ld.u32 %r1, [%rd11];
+; PTX_70-NEXT: st.global.u32 [%rd1], %r1;
+; PTX_70-NEXT: ret;
bb:
br i1 %cond, label %first, label %second
@@ -764,6 +1033,35 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
; COMMON-NEXT: ret void
;
+; PTX-LABEL: test_phi_write(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot16[8];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<7>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0: // %bb
+; PTX-NEXT: mov.b64 %SPL, __local_depot16;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: ld.param.u8 %rs1, [test_phi_write_param_2];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; PTX-NEXT: add.u64 %rd1, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [test_phi_write_param_1+4];
+; PTX-NEXT: st.u32 [%SP], %r1;
+; PTX-NEXT: add.u64 %rd6, %SPL, 4;
+; PTX-NEXT: ld.param.u32 %r2, [test_phi_write_param_0];
+; PTX-NEXT: st.u32 [%SP+4], %r2;
+; PTX-NEXT: @%p1 bra $L__BB16_2;
+; PTX-NEXT: // %bb.1: // %second
+; PTX-NEXT: mov.b64 %rd6, %rd1;
+; PTX-NEXT: $L__BB16_2: // %merge
+; PTX-NEXT: mov.b32 %r3, 1;
+; PTX-NEXT: st.local.u32 [%rd6], %r3;
+; PTX-NEXT: ret;
bb:
br i1 %cond, label %first, label %second
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll
new file mode 100644
index 0000000000000..634e77a9c459e
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+
+define ptx_kernel i32 @test_kernel(ptr %a, ptr byval(i32) %b) {
+; CHECK-LABEL: define ptx_kernel i32 @test_kernel(
+; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]]
+; CHECK-NEXT: ret i32 [[SUM]]
+;
+ %v1 = load i32, ptr %a
+ %v2 = load i32, ptr %b
+ %sum = add i32 %v1, %v2
+ ret i32 %sum
+}
+
+define i32 @test_device(ptr %a, ptr byval(i32) %b) {
+; CHECK-LABEL: define i32 @test_device(
+; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]]
+; CHECK-NEXT: ret i32 [[SUM]]
+;
+ %v1 = load i32, ptr %a
+ %v2 = load i32, ptr %b
+ %sum = add i32 %v1, %v2
+ ret i32 %sum
+}
>From 519bed1d122c3ab44369e6410103f8d1d1b88bb6 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 3 Apr 2025 15:45:45 +0000
Subject: [PATCH 2/2] [InferAS] Support getAssumedAddrSpace for Arguments for
NVPTX
---
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 12 +-
.../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 15 +
.../Transforms/Scalar/InferAddressSpaces.cpp | 49 ++-
llvm/test/CodeGen/NVPTX/i1-ext-load.ll | 10 +-
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 46 +--
llvm/test/CodeGen/NVPTX/lower-args.ll | 34 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 361 +++++++-----------
llvm/test/DebugInfo/NVPTX/debug-addr-class.ll | 53 +--
llvm/test/DebugInfo/NVPTX/debug-info.ll | 80 ++--
.../InferAddressSpaces/NVPTX/arguments.ll | 6 +-
10 files changed, 298 insertions(+), 368 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 2637b9fab0d50..a683726facd0c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -678,11 +678,8 @@ static bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F) {
LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n");
for (Argument &Arg : F.args()) {
- if (Arg.getType()->isPointerTy()) {
- if (Arg.hasByValAttr())
- handleByValParam(TM, &Arg);
- else if (TM.getDrvInterface() == NVPTX::CUDA)
- markPointerAsGlobal(&Arg);
+ if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {
+ handleByValParam(TM, &Arg);
} else if (Arg.getType()->isIntegerTy() &&
TM.getDrvInterface() == NVPTX::CUDA) {
HandleIntToPtr(Arg);
@@ -699,10 +696,9 @@ static bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F) {
cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
for (Argument &Arg : F.args())
- if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {
- markPointerAsAS(&Arg, ADDRESS_SPACE_LOCAL);
+ if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
adjustByValArgAlignment(&Arg, &Arg, TLI);
- }
+
return true;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index a89ca3037c7ff..e359735c20750 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -599,6 +599,21 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
if (isa<AllocaInst>(V))
return ADDRESS_SPACE_LOCAL;
+ if (const Argument *Arg = dyn_cast<Argument>(V)) {
+ if (isKernelFunction(*Arg->getParent())) {
+ const NVPTXTargetMachine &TM =
+ static_cast<const NVPTXTargetMachine &>(getTLI()->getTargetMachine());
+ if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr())
+ return ADDRESS_SPACE_GLOBAL;
+ } else {
+ // We assume that all device parameters that are passed byval will be
+ // placed in the local AS. Very simple cases will be updated after ISel to
+ // use the device param space where possible.
+ if (Arg->hasByValAttr())
+ return ADDRESS_SPACE_LOCAL;
+ }
+ }
+
return -1;
}
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 73a3f5e4d3694..b65a08be75640 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -305,10 +305,16 @@ static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL,
}
// Returns true if V is an address expression.
-// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
-// getelementptr operators.
+// TODO: Currently, we only consider:
+// - arguments
+// - phi, bitcast, addrspacecast, and getelementptr operators
static bool isAddressExpression(const Value &V, const DataLayout &DL,
const TargetTransformInfo *TTI) {
+
+ if (const Argument *Arg = dyn_cast<Argument>(&V))
+ return Arg->getType()->isPointerTy() &&
+ TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace;
+
const Operator *Op = dyn_cast<Operator>(&V);
if (!Op)
return false;
@@ -341,6 +347,9 @@ static bool isAddressExpression(const Value &V, const DataLayout &DL,
static SmallVector<Value *, 2>
getPointerOperands(const Value &V, const DataLayout &DL,
const TargetTransformInfo *TTI) {
+ if (isa<Argument>(&V))
+ return {};
+
const Operator &Op = cast<Operator>(V);
switch (Op.getOpcode()) {
case Instruction::PHI: {
@@ -505,13 +514,11 @@ void InferAddressSpacesImpl::appendsFlatAddressExpressionToPostorderStack(
if (Visited.insert(V).second) {
PostorderStack.emplace_back(V, false);
- Operator *Op = cast<Operator>(V);
- for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
- if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
- PostorderStack.emplace_back(CE, false);
- }
- }
+ if (auto *Op = dyn_cast<Operator>(V))
+ for (auto &O : Op->operands())
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(O))
+ if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
}
}
}
@@ -828,6 +835,18 @@ Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace(
assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
isAddressExpression(*V, *DL, TTI));
+ if (auto *Arg = dyn_cast<Argument>(V)) {
+ // Arguments are address-space cast in the function body, as we do not
+ // want to change the function signature.
+ Function *F = Arg->getParent();
+ BasicBlock::iterator Insert = F->getEntryBlock().getFirstNonPHIIt();
+
+ Type *NewPtrTy = PointerType::get(Arg->getContext(), NewAddrSpace);
+ auto *NewI = new AddrSpaceCastInst(Arg, NewPtrTy);
+ NewI->insertBefore(Insert);
+ return NewI;
+ }
+
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
I, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, PoisonUsesToFix);
@@ -966,8 +985,12 @@ bool InferAddressSpacesImpl::updateAddressSpace(
// of all its pointer operands.
unsigned NewAS = UninitializedAddressSpace;
- const Operator &Op = cast<Operator>(V);
- if (Op.getOpcode() == Instruction::Select) {
+ // isAddressExpression should guarantee that V is an operator or an argument.
+ assert(isa<Operator>(V) || isa<Argument>(V));
+
+ if (isa<Operator>(V) &&
+ cast<Operator>(V).getOpcode() == Instruction::Select) {
+ const Operator &Op = cast<Operator>(V);
Value *Src0 = Op.getOperand(1);
Value *Src1 = Op.getOperand(2);
@@ -1258,7 +1281,7 @@ void InferAddressSpacesImpl::performPointerReplacement(
}
// Otherwise, replaces the use with flat(NewV).
- if (Instruction *VInst = dyn_cast<Instruction>(V)) {
+ if (isa<Instruction>(V) || isa<Instruction>(NewV)) {
// Don't create a copy of the original addrspacecast.
if (U == V && isa<AddrSpaceCastInst>(V))
return;
@@ -1268,7 +1291,7 @@ void InferAddressSpacesImpl::performPointerReplacement(
if (Instruction *NewVInst = dyn_cast<Instruction>(NewV))
InsertPos = std::next(NewVInst->getIterator());
else
- InsertPos = std::next(VInst->getIterator());
+ InsertPos = std::next(cast<Instruction>(V)->getIterator());
while (isa<PHINode>(InsertPos))
++InsertPos;
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index f5f1dd9fcf0ea..44ac46db254a7 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -12,14 +12,14 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
; CHECK: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK: ld.param.u64 %rd1, [foo_param_0];
-; CHECK: ld.param.u64 %rd2, [foo_param_1];
-; CHECK: cvta.to.global.u64 %rd3, %rd2;
-; CHECK: cvta.to.global.u64 %rd4, %rd1;
-; CHECK: ld.global.nc.u8 %rs1, [%rd4];
+; CHECK: cvta.to.global.u64 %rd2, %rd1;
+; CHECK: ld.param.u64 %rd3, [foo_param_1];
+; CHECK: cvta.to.global.u64 %rd4, %rd3;
+; CHECK: ld.global.nc.u8 %rs1, [%rd2];
; CHECK: cvt.u32.u8 %r1, %rs1;
; CHECK: add.s32 %r2, %r1, 1;
; CHECK: and.b32 %r3, %r2, 1;
-; CHECK: st.global.u32 [%rd3], %r3;
+; CHECK: st.global.u32 [%rd4], %r3;
; CHECK: ret;
%ld = load i1, ptr %ptr, align 1
%zext = zext i1 %ld to i32
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index e4e1f40d0d8b2..38b7400696c54 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -12,9 +12,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
; OPT-LABEL: define dso_local noundef i32 @non_kernel_function(
; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; OPT-NEXT: [[ENTRY:.*:]]
-; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(5)
-; OPT-NEXT: [[A1:%.*]] = addrspacecast ptr addrspace(5) [[A2]] to ptr
-; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
+; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64
; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]]
; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1
@@ -74,12 +72,10 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_int(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
-; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
; OPT-NEXT: ret void
%tmp = load i32, ptr %input1, align 4
%add = add i32 %tmp, %input2
@@ -105,15 +101,13 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
-; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
; OPT-NEXT: ret void
%gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -233,11 +227,9 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
-; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
; OPT-NEXT: ret void
store ptr %input, ptr %addr, align 8
ret void
@@ -263,14 +255,12 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
; PTX-NOT .local
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
-; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
-; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8
+; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT]], align 8
; OPT-NEXT: ret void
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -311,13 +301,11 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
-; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
-; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
+; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
; OPT-NEXT: ret void
%val = load i32, ptr %input
@@ -361,15 +349,13 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input,
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
-; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
-; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT]], align 8
; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
; OPT-NEXT: ret i32 [[ADD]]
@@ -407,11 +393,9 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
-; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
-; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
@@ -423,7 +407,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
; OPT: [[MERGE]]:
; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; OPT-NEXT: ret void
%val = load i32, ptr %inout
@@ -470,13 +454,11 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
-; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
-; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
@@ -488,7 +470,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
; OPT: [[MERGE]]:
; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
@@ -531,17 +513,15 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_select(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
-; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
-; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index a1c0a86e9c4e4..8fa7d5c3e0cbc 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC
@@ -17,12 +17,10 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
; IR-LABEL: define void @load_alignment(
; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) {
; IR-NEXT: [[ENTRY:.*:]]
-; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5)
-; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr
-; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG1]], align 8
-; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 0, i32 1
+; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG]], align 8
+; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 0, i32 1
; IR-NEXT: [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8
-; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 1
+; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 1
; IR-NEXT: [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8
; IR-NEXT: [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4
; IR-NEXT: [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]]
@@ -77,9 +75,7 @@ entry:
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
; IR-LABEL: define void @load_padding(
; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) align 4 captures(none) [[ARG:%.*]]) {
-; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5)
-; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr
-; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG1]])
+; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG]])
; IR-NEXT: ret void
;
; PTX-LABEL: load_padding(
@@ -108,21 +104,11 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
; OpenCL can't make assumptions about incoming pointer, so we should generate
; generic pointers load/store.
define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
-; IRC-LABEL: define ptx_kernel void @ptr_generic(
-; IRC-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
-; IRC-NEXT: [[IN3:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
-; IRC-NEXT: [[IN4:%.*]] = addrspacecast ptr addrspace(1) [[IN3]] to ptr
-; IRC-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; IRC-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; IRC-NEXT: [[V:%.*]] = load i32, ptr [[IN4]], align 4
-; IRC-NEXT: store i32 [[V]], ptr [[OUT2]], align 4
-; IRC-NEXT: ret void
-;
-; IRO-LABEL: define ptx_kernel void @ptr_generic(
-; IRO-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
-; IRO-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4
-; IRO-NEXT: store i32 [[V]], ptr [[OUT]], align 4
-; IRO-NEXT: ret void
+; IR-LABEL: define ptx_kernel void @ptr_generic(
+; IR-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
+; IR-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4
+; IR-NEXT: store i32 [[V]], ptr [[OUT]], align 4
+; IR-NEXT: ret void
;
; PTXC-LABEL: ptr_generic(
; PTXC: {
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 13522145678a0..4631732b81ea6 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -33,10 +33,8 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
-; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only(
@@ -72,11 +70,9 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
-; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
@@ -114,11 +110,9 @@ define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeo
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
-; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
@@ -154,33 +148,18 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
-; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; LOWER-ARGS-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
-; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
-; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
-; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; COPY-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
-; COPY-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
-; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
+; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT]], align 4
+; COMMON-NEXT: ret void
;
; PTX-LABEL: read_only_gep_asc0(
; PTX: {
@@ -189,8 +168,8 @@ define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef write
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc0_param_0];
-; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4];
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4];
; PTX-NEXT: st.global.u32 [%rd2], %r1;
; PTX-NEXT: ret;
entry:
@@ -204,16 +183,23 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; LOWER-ARGS-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
-; LOWER-ARGS-NEXT: ret void
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr(
+; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; SM_60-NEXT: [[ENTRY:.*:]]
+; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]]
+; SM_60-NEXT: ret void
+;
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr(
+; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; SM_70-NEXT: [[ENTRY:.*:]]
+; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
+; SM_70-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr(
; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -221,7 +207,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]])
+; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]]
; COPY-NEXT: ret void
;
; PTX-LABEL: escape_ptr(
@@ -258,17 +244,25 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; LOWER-ARGS-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; LOWER-ARGS-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
-; LOWER-ARGS-NEXT: ret void
+; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
+; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; SM_60-NEXT: [[ENTRY:.*:]]
+; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]]
+; SM_60-NEXT: ret void
+;
+; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
+; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; SM_70-NEXT: [[ENTRY:.*:]]
+; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
+; SM_70-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -277,7 +271,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
+; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]]
; COPY-NEXT: ret void
;
; PTX-LABEL: escape_ptr_gep(
@@ -316,25 +310,14 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
-; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
-; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
+; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
+; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptr_store(
; PTX: {
@@ -348,14 +331,14 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
; PTX-NEXT: mov.b64 %SPL, __local_depot6;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0];
-; PTX-NEXT: add.u64 %rd2, %SP, 0;
-; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: add.u64 %rd3, %SP, 0;
+; PTX-NEXT: add.u64 %rd4, %SPL, 0;
; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_store_param_1+4];
-; PTX-NEXT: st.local.u32 [%rd3+4], %r1;
+; PTX-NEXT: st.local.u32 [%rd4+4], %r1;
; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_store_param_1];
-; PTX-NEXT: st.local.u32 [%rd3], %r2;
-; PTX-NEXT: cvta.to.global.u64 %rd4, %rd1;
-; PTX-NEXT: st.global.u64 [%rd4], %rd2;
+; PTX-NEXT: st.local.u32 [%rd4], %r2;
+; PTX-NEXT: st.global.u64 [%rd2], %rd3;
; PTX-NEXT: ret;
entry:
store ptr %s, ptr %out, align 8
@@ -364,27 +347,15 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
-; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
-; LOWER-ARGS-NEXT: store ptr [[B]], ptr [[OUT2]], align 8
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
-; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COPY-NEXT: store ptr [[B]], ptr [[OUT]], align 8
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
+; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8
+; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptr_gep_store(
; PTX: {
@@ -398,15 +369,15 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
; PTX-NEXT: mov.b64 %SPL, __local_depot7;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_gep_store_param_0];
-; PTX-NEXT: add.u64 %rd2, %SP, 0;
-; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: add.u64 %rd3, %SP, 0;
+; PTX-NEXT: add.u64 %rd4, %SPL, 0;
; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_store_param_1+4];
-; PTX-NEXT: st.local.u32 [%rd3+4], %r1;
+; PTX-NEXT: st.local.u32 [%rd4+4], %r1;
; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_store_param_1];
-; PTX-NEXT: st.local.u32 [%rd3], %r2;
-; PTX-NEXT: cvta.to.global.u64 %rd4, %rd1;
-; PTX-NEXT: add.s64 %rd5, %rd2, 4;
-; PTX-NEXT: st.global.u64 [%rd4], %rd5;
+; PTX-NEXT: st.local.u32 [%rd4], %r2;
+; PTX-NEXT: add.s64 %rd5, %rd3, 4;
+; PTX-NEXT: st.global.u64 [%rd2], %rd5;
; PTX-NEXT: ret;
entry:
%b = getelementptr inbounds nuw i8, ptr %s, i64 4
@@ -416,27 +387,15 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
-; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
-; LOWER-ARGS-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
-; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64
-; COPY-NEXT: store i64 [[I]], ptr [[OUT]], align 8
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
+; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64
+; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8
+; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptrtoint(
; PTX: {
@@ -450,14 +409,14 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
; PTX-NEXT: mov.b64 %SPL, __local_depot8;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0];
-; PTX-NEXT: add.u64 %rd2, %SP, 0;
-; PTX-NEXT: add.u64 %rd3, %SPL, 0;
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: add.u64 %rd3, %SP, 0;
+; PTX-NEXT: add.u64 %rd4, %SPL, 0;
; PTX-NEXT: ld.param.u32 %r1, [escape_ptrtoint_param_1+4];
-; PTX-NEXT: st.local.u32 [%rd3+4], %r1;
+; PTX-NEXT: st.local.u32 [%rd4+4], %r1;
; PTX-NEXT: ld.param.u32 %r2, [escape_ptrtoint_param_1];
-; PTX-NEXT: st.local.u32 [%rd3], %r2;
-; PTX-NEXT: cvta.to.global.u64 %rd4, %rd1;
-; PTX-NEXT: st.global.u64 [%rd4], %rd2;
+; PTX-NEXT: st.local.u32 [%rd4], %r2;
+; PTX-NEXT: st.global.u64 [%rd2], %rd3;
; PTX-NEXT: ret;
entry:
%i = ptrtoint ptr %s to i64
@@ -471,9 +430,7 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
@@ -536,9 +493,7 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
+; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
@@ -597,25 +552,14 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
-; LOWER-ARGS-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
-; LOWER-ARGS-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
-; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
+; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
+; COMMON-NEXT: ret void
;
; PTX-LABEL: memcpy_to_param(
; PTX: {
@@ -688,27 +632,15 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
-; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @copy_on_store(
-; LOWER-ARGS-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; LOWER-ARGS-NEXT: [[BB:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
-; LOWER-ARGS-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false)
-; LOWER-ARGS-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
-; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
-; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[S3]], align 4
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @copy_on_store(
-; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[BB:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4
-; COPY-NEXT: store i32 [[I]], ptr [[S1]], align 4
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @copy_on_store(
+; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4
+; COMMON-NEXT: ret void
;
; PTX-LABEL: copy_on_store(
; PTX: {
@@ -726,8 +658,6 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_60-LABEL: define ptx_kernel void @test_select(
; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_60-NEXT: [[BB:.*:]]
-; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
@@ -736,21 +666,19 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4
+; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
; SM_60-NEXT: ret void
;
; SM_70-LABEL: define ptx_kernel void @test_select(
; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4
+; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
; SM_70-NEXT: ret void
;
; COPY-LABEL: define ptx_kernel void @test_select(
@@ -817,33 +745,18 @@ bb:
}
define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
-; LOWER-ARGS-LABEL: define ptx_kernel void @test_select_write(
-; LOWER-ARGS-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
-; LOWER-ARGS-NEXT: [[BB:.*:]]
-; LOWER-ARGS-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
-; LOWER-ARGS-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr
-; LOWER-ARGS-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; LOWER-ARGS-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
-; LOWER-ARGS-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; LOWER-ARGS-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
-; LOWER-ARGS-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
-; LOWER-ARGS-NEXT: store i32 1, ptr [[PTRNEW]], align 4
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define ptx_kernel void @test_select_write(
-; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
-; COPY-NEXT: [[BB:.*:]]
-; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
-; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
-; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
-; COPY-NEXT: store i32 1, ptr [[PTRNEW]], align 4
-; COPY-NEXT: ret void
+; COMMON-LABEL: define ptx_kernel void @test_select_write(
+; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
+; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
+; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
+; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
+; COMMON-NEXT: ret void
;
; PTX-LABEL: test_select_write(
; PTX: {
@@ -881,8 +794,6 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_60-LABEL: define ptx_kernel void @test_phi(
; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_60-NEXT: [[BB:.*:]]
-; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
-; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr
; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
@@ -899,14 +810,12 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_60: [[MERGE]]:
; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4
+; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; SM_60-NEXT: ret void
;
; SM_70-LABEL: define ptx_kernel void @test_phi(
; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
-; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
@@ -921,7 +830,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_70: [[MERGE]]:
; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; SM_70-NEXT: ret void
;
; COPY-LABEL: define ptx_kernel void @test_phi(
@@ -1013,7 +922,7 @@ merge: ; preds = %second, %first
define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
; COMMON-LABEL: define ptx_kernel void @test_phi_write(
-; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
index 82301e42f7d06..a257b6cfd77b7 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll
@@ -169,19 +169,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
; CHECK-NEXT: .b8 0 // EOM(1)
; CHECK-NEXT: .b8 0 // EOM(2)
; CHECK-NEXT: .b8 6 // Abbreviation Code
-; CHECK-NEXT: .b8 5 // DW_TAG_formal_parameter
-; CHECK-NEXT: .b8 0 // DW_CHILDREN_no
-; CHECK-NEXT: .b8 3 // DW_AT_name
-; CHECK-NEXT: .b8 8 // DW_FORM_string
-; CHECK-NEXT: .b8 58 // DW_AT_decl_file
-; CHECK-NEXT: .b8 11 // DW_FORM_data1
-; CHECK-NEXT: .b8 59 // DW_AT_decl_line
-; CHECK-NEXT: .b8 11 // DW_FORM_data1
-; CHECK-NEXT: .b8 73 // DW_AT_type
-; CHECK-NEXT: .b8 19 // DW_FORM_ref4
-; CHECK-NEXT: .b8 0 // EOM(1)
-; CHECK-NEXT: .b8 0 // EOM(2)
-; CHECK-NEXT: .b8 7 // Abbreviation Code
; CHECK-NEXT: .b8 15 // DW_TAG_pointer_type
; CHECK-NEXT: .b8 0 // DW_CHILDREN_no
; CHECK-NEXT: .b8 73 // DW_AT_type
@@ -192,12 +179,12 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
; CHECK-NEXT: }
; CHECK-NEXT: .section .debug_info
; CHECK-NEXT: {
-; CHECK-NEXT: .b32 238 // Length of Unit
+; CHECK-NEXT: .b32 254 // Length of Unit
; CHECK-NEXT: .b8 2 // DWARF version number
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b32 .debug_abbrev // Offset Into Abbrev. Section
; CHECK-NEXT: .b8 8 // Address Size (in bytes)
-; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit
+; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0xf7 DW_TAG_compile_unit
; CHECK-NEXT: .b8 99 // DW_AT_producer
; CHECK-NEXT: .b8 108
; CHECK-NEXT: .b8 97
@@ -307,7 +294,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
; CHECK-NEXT: .b8 9 // DW_AT_location
; CHECK-NEXT: .b8 3
; CHECK-NEXT: .b64 SHARED
-; CHECK-NEXT: .b8 4 // Abbrev [4] 0x90:0x53 DW_TAG_subprogram
+; CHECK-NEXT: .b8 4 // Abbrev [4] 0x90:0x63 DW_TAG_subprogram
; CHECK-NEXT: .b64 $L__func_begin0 // DW_AT_low_pc
; CHECK-NEXT: .b64 $L__func_end0 // DW_AT_high_pc
; CHECK-NEXT: .b8 1 // DW_AT_frame_base
@@ -337,20 +324,36 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 6 // DW_AT_decl_line
-; CHECK-NEXT: .b32 227 // DW_AT_type
-; CHECK-NEXT: .b8 6 // Abbrev [6] 0xc0:0x9 DW_TAG_formal_parameter
+; CHECK-NEXT: .b32 248 // DW_AT_type
+; CHECK-NEXT: .b8 5 // Abbrev [5] 0xc0:0x11 DW_TAG_formal_parameter
+; CHECK-NEXT: .b8 2 // DW_AT_address_class
+; CHECK-NEXT: .b8 6 // DW_AT_location
+; CHECK-NEXT: .b8 144
+; CHECK-NEXT: .b8 177
+; CHECK-NEXT: .b8 200
+; CHECK-NEXT: .b8 201
+; CHECK-NEXT: .b8 171
+; CHECK-NEXT: .b8 2
; CHECK-NEXT: .b8 120 // DW_AT_name
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 6 // DW_AT_decl_line
-; CHECK-NEXT: .b32 236 // DW_AT_type
-; CHECK-NEXT: .b8 6 // Abbrev [6] 0xc9:0x9 DW_TAG_formal_parameter
+; CHECK-NEXT: .b32 243 // DW_AT_type
+; CHECK-NEXT: .b8 5 // Abbrev [5] 0xd1:0x11 DW_TAG_formal_parameter
+; CHECK-NEXT: .b8 2 // DW_AT_address_class
+; CHECK-NEXT: .b8 6 // DW_AT_location
+; CHECK-NEXT: .b8 144
+; CHECK-NEXT: .b8 179
+; CHECK-NEXT: .b8 200
+; CHECK-NEXT: .b8 201
+; CHECK-NEXT: .b8 171
+; CHECK-NEXT: .b8 2
; CHECK-NEXT: .b8 121 // DW_AT_name
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 6 // DW_AT_decl_line
-; CHECK-NEXT: .b32 236 // DW_AT_type
-; CHECK-NEXT: .b8 5 // Abbrev [5] 0xd2:0x10 DW_TAG_formal_parameter
+; CHECK-NEXT: .b32 243 // DW_AT_type
+; CHECK-NEXT: .b8 5 // Abbrev [5] 0xe2:0x10 DW_TAG_formal_parameter
; CHECK-NEXT: .b8 2 // DW_AT_address_class
; CHECK-NEXT: .b8 5 // DW_AT_location
; CHECK-NEXT: .b8 144
@@ -364,7 +367,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
; CHECK-NEXT: .b8 6 // DW_AT_decl_line
; CHECK-NEXT: .b32 111 // DW_AT_type
; CHECK-NEXT: .b8 0 // End Of Children Mark
-; CHECK-NEXT: .b8 3 // Abbrev [3] 0xe3:0x9 DW_TAG_base_type
+; CHECK-NEXT: .b8 6 // Abbrev [6] 0xf3:0x5 DW_TAG_pointer_type
+; CHECK-NEXT: .b32 248 // DW_AT_type
+; CHECK-NEXT: .b8 3 // Abbrev [3] 0xf8:0x9 DW_TAG_base_type
; CHECK-NEXT: .b8 102 // DW_AT_name
; CHECK-NEXT: .b8 108
; CHECK-NEXT: .b8 111
@@ -373,8 +378,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 4 // DW_AT_encoding
; CHECK-NEXT: .b8 4 // DW_AT_byte_size
-; CHECK-NEXT: .b8 7 // Abbrev [7] 0xec:0x5 DW_TAG_pointer_type
-; CHECK-NEXT: .b32 227 // DW_AT_type
; CHECK-NEXT: .b8 0 // End Of Children Mark
; CHECK-NEXT: }
; CHECK-NEXT: .section .debug_macinfo { }
diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll
index 62b30a1f15aff..fa2925af37971 100644
--- a/llvm/test/DebugInfo/NVPTX/debug-info.ll
+++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll
@@ -100,8 +100,8 @@ if.end: ; preds = %if.then, %entry
; CHECK: .section .debug_loc
; CHECK-NEXT: {
; CHECK-NEXT: $L__debug_loc0:
-; CHECK-NEXT: .b64 $L__tmp8
; CHECK-NEXT: .b64 $L__tmp10
+; CHECK-NEXT: .b64 $L__tmp12
; CHECK-NEXT: .b8 5 // Loc expr size
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 144 // DW_OP_regx
@@ -112,7 +112,7 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: .b64 0
; CHECK-NEXT: .b64 0
; CHECK-NEXT: $L__debug_loc1:
-; CHECK-NEXT: .b64 $L__tmp5
+; CHECK-NEXT: .b64 $L__tmp7
; CHECK-NEXT: .b64 $L__func_end0
; CHECK-NEXT: .b8 5 // Loc expr size
; CHECK-NEXT: .b8 0
@@ -586,12 +586,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: }
; CHECK-NEXT: .section .debug_info
; CHECK-NEXT: {
-; CHECK-NEXT: .b32 2388 // Length of Unit
+; CHECK-NEXT: .b32 2404 // Length of Unit
; CHECK-NEXT: .b8 2 // DWARF version number
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b32 .debug_abbrev // Offset Into Abbrev. Section
; CHECK-NEXT: .b8 8 // Address Size (in bytes)
-; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0x94d DW_TAG_compile_unit
+; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0x95d DW_TAG_compile_unit
; CHECK-NEXT: .b8 0 // DW_AT_producer
; CHECK-NEXT: .b8 4 // DW_AT_language
; CHECK-NEXT: .b8 0
@@ -2481,7 +2481,7 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: .b8 4 // DW_AT_byte_size
; CHECK-NEXT: .b8 12 // Abbrev [12] 0x83d:0x5 DW_TAG_pointer_type
; CHECK-NEXT: .b32 2100 // DW_AT_type
-; CHECK-NEXT: .b8 23 // Abbrev [23] 0x842:0xd5 DW_TAG_subprogram
+; CHECK-NEXT: .b8 23 // Abbrev [23] 0x842:0xe5 DW_TAG_subprogram
; CHECK-NEXT: .b64 $L__func_begin0 // DW_AT_low_pc
; CHECK-NEXT: .b64 $L__func_end0 // DW_AT_high_pc
; CHECK-NEXT: .b8 1 // DW_AT_frame_base
@@ -2522,7 +2522,7 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 5 // DW_AT_decl_line
-; CHECK-NEXT: .b32 2384 // DW_AT_type
+; CHECK-NEXT: .b32 2400 // DW_AT_type
; CHECK-NEXT: .b8 25 // Abbrev [25] 0x87d:0xd DW_TAG_formal_parameter
; CHECK-NEXT: .b32 $L__debug_loc0 // DW_AT_location
; CHECK-NEXT: .b8 97 // DW_AT_name
@@ -2530,54 +2530,70 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 5 // DW_AT_decl_line
; CHECK-NEXT: .b32 2100 // DW_AT_type
-; CHECK-NEXT: .b8 22 // Abbrev [22] 0x88a:0x9 DW_TAG_formal_parameter
+; CHECK-NEXT: .b8 24 // Abbrev [24] 0x88a:0x11 DW_TAG_formal_parameter
+; CHECK-NEXT: .b8 2 // DW_AT_address_class
+; CHECK-NEXT: .b8 6 // DW_AT_location
+; CHECK-NEXT: .b8 144
+; CHECK-NEXT: .b8 179
+; CHECK-NEXT: .b8 200
+; CHECK-NEXT: .b8 201
+; CHECK-NEXT: .b8 171
+; CHECK-NEXT: .b8 2
; CHECK-NEXT: .b8 120 // DW_AT_name
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 5 // DW_AT_decl_line
; CHECK-NEXT: .b32 2109 // DW_AT_type
-; CHECK-NEXT: .b8 22 // Abbrev [22] 0x893:0x9 DW_TAG_formal_parameter
+; CHECK-NEXT: .b8 24 // Abbrev [24] 0x89b:0x11 DW_TAG_formal_parameter
+; CHECK-NEXT: .b8 2 // DW_AT_address_class
+; CHECK-NEXT: .b8 6 // DW_AT_location
+; CHECK-NEXT: .b8 144
+; CHECK-NEXT: .b8 180
+; CHECK-NEXT: .b8 200
+; CHECK-NEXT: .b8 201
+; CHECK-NEXT: .b8 171
+; CHECK-NEXT: .b8 2
; CHECK-NEXT: .b8 121 // DW_AT_name
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 5 // DW_AT_decl_line
; CHECK-NEXT: .b32 2109 // DW_AT_type
-; CHECK-NEXT: .b8 26 // Abbrev [26] 0x89c:0xd DW_TAG_variable
+; CHECK-NEXT: .b8 26 // Abbrev [26] 0x8ac:0xd DW_TAG_variable
; CHECK-NEXT: .b32 $L__debug_loc1 // DW_AT_location
; CHECK-NEXT: .b8 105 // DW_AT_name
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 1 // DW_AT_decl_file
; CHECK-NEXT: .b8 6 // DW_AT_decl_line
-; CHECK-NEXT: .b32 2384 // DW_AT_type
-; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8a9:0x18 DW_TAG_inlined_subroutine
+; CHECK-NEXT: .b32 2400 // DW_AT_type
+; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8b9:0x18 DW_TAG_inlined_subroutine
; CHECK-NEXT: .b32 691 // DW_AT_abstract_origin
-; CHECK-NEXT: .b64 $L__tmp1 // DW_AT_low_pc
-; CHECK-NEXT: .b64 $L__tmp2 // DW_AT_high_pc
+; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_low_pc
+; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_high_pc
; CHECK-NEXT: .b8 1 // DW_AT_call_file
; CHECK-NEXT: .b8 6 // DW_AT_call_line
; CHECK-NEXT: .b8 11 // DW_AT_call_column
-; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8c1:0x18 DW_TAG_inlined_subroutine
+; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8d1:0x18 DW_TAG_inlined_subroutine
; CHECK-NEXT: .b32 1450 // DW_AT_abstract_origin
-; CHECK-NEXT: .b64 $L__tmp2 // DW_AT_low_pc
-; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_high_pc
+; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_low_pc
+; CHECK-NEXT: .b64 $L__tmp5 // DW_AT_high_pc
; CHECK-NEXT: .b8 1 // DW_AT_call_file
; CHECK-NEXT: .b8 6 // DW_AT_call_line
; CHECK-NEXT: .b8 24 // DW_AT_call_column
-; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8d9:0x18 DW_TAG_inlined_subroutine
+; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8e9:0x18 DW_TAG_inlined_subroutine
; CHECK-NEXT: .b32 2044 // DW_AT_abstract_origin
-; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_low_pc
-; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_high_pc
+; CHECK-NEXT: .b64 $L__tmp5 // DW_AT_low_pc
+; CHECK-NEXT: .b64 $L__tmp6 // DW_AT_high_pc
; CHECK-NEXT: .b8 1 // DW_AT_call_file
; CHECK-NEXT: .b8 6 // DW_AT_call_line
; CHECK-NEXT: .b8 37 // DW_AT_call_column
-; CHECK-NEXT: .b8 28 // Abbrev [28] 0x8f1:0x25 DW_TAG_inlined_subroutine
+; CHECK-NEXT: .b8 28 // Abbrev [28] 0x901:0x25 DW_TAG_inlined_subroutine
; CHECK-NEXT: .b32 2050 // DW_AT_abstract_origin
-; CHECK-NEXT: .b64 $L__tmp9 // DW_AT_low_pc
-; CHECK-NEXT: .b64 $L__tmp10 // DW_AT_high_pc
+; CHECK-NEXT: .b64 $L__tmp11 // DW_AT_low_pc
+; CHECK-NEXT: .b64 $L__tmp12 // DW_AT_high_pc
; CHECK-NEXT: .b8 1 // DW_AT_call_file
; CHECK-NEXT: .b8 8 // DW_AT_call_line
; CHECK-NEXT: .b8 5 // DW_AT_call_column
-; CHECK-NEXT: .b8 29 // Abbrev [29] 0x909:0xc DW_TAG_formal_parameter
+; CHECK-NEXT: .b8 29 // Abbrev [29] 0x919:0xc DW_TAG_formal_parameter
; CHECK-NEXT: .b8 2 // DW_AT_address_class
; CHECK-NEXT: .b8 5 // DW_AT_location
; CHECK-NEXT: .b8 144
@@ -2588,17 +2604,17 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: .b32 2079 // DW_AT_abstract_origin
; CHECK-NEXT: .b8 0 // End Of Children Mark
; CHECK-NEXT: .b8 0 // End Of Children Mark
-; CHECK-NEXT: .b8 30 // Abbrev [30] 0x917:0xd DW_TAG_namespace
+; CHECK-NEXT: .b8 30 // Abbrev [30] 0x927:0xd DW_TAG_namespace
; CHECK-NEXT: .b8 115 // DW_AT_name
; CHECK-NEXT: .b8 116
; CHECK-NEXT: .b8 100
; CHECK-NEXT: .b8 0
-; CHECK-NEXT: .b8 31 // Abbrev [31] 0x91c:0x7 DW_TAG_imported_declaration
+; CHECK-NEXT: .b8 31 // Abbrev [31] 0x92c:0x7 DW_TAG_imported_declaration
; CHECK-NEXT: .b8 4 // DW_AT_decl_file
; CHECK-NEXT: .b8 202 // DW_AT_decl_line
-; CHECK-NEXT: .b32 2340 // DW_AT_import
+; CHECK-NEXT: .b32 2356 // DW_AT_import
; CHECK-NEXT: .b8 0 // End Of Children Mark
-; CHECK-NEXT: .b8 32 // Abbrev [32] 0x924:0x1b DW_TAG_subprogram
+; CHECK-NEXT: .b8 32 // Abbrev [32] 0x934:0x1b DW_TAG_subprogram
; CHECK-NEXT: .b8 95 // DW_AT_MIPS_linkage_name
; CHECK-NEXT: .b8 90
; CHECK-NEXT: .b8 76
@@ -2614,12 +2630,12 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 4 // DW_AT_decl_file
; CHECK-NEXT: .b8 44 // DW_AT_decl_line
-; CHECK-NEXT: .b32 2367 // DW_AT_type
+; CHECK-NEXT: .b32 2383 // DW_AT_type
; CHECK-NEXT: .b8 1 // DW_AT_declaration
-; CHECK-NEXT: .b8 7 // Abbrev [7] 0x939:0x5 DW_TAG_formal_parameter
-; CHECK-NEXT: .b32 2367 // DW_AT_type
+; CHECK-NEXT: .b8 7 // Abbrev [7] 0x949:0x5 DW_TAG_formal_parameter
+; CHECK-NEXT: .b32 2383 // DW_AT_type
; CHECK-NEXT: .b8 0 // End Of Children Mark
-; CHECK-NEXT: .b8 10 // Abbrev [10] 0x93f:0x11 DW_TAG_base_type
+; CHECK-NEXT: .b8 10 // Abbrev [10] 0x94f:0x11 DW_TAG_base_type
; CHECK-NEXT: .b8 108 // DW_AT_name
; CHECK-NEXT: .b8 111
; CHECK-NEXT: .b8 110
@@ -2636,7 +2652,7 @@ if.end: ; preds = %if.then, %entry
; CHECK-NEXT: .b8 0
; CHECK-NEXT: .b8 5 // DW_AT_encoding
; CHECK-NEXT: .b8 8 // DW_AT_byte_size
-; CHECK-NEXT: .b8 10 // Abbrev [10] 0x950:0x7 DW_TAG_base_type
+; CHECK-NEXT: .b8 10 // Abbrev [10] 0x960:0x7 DW_TAG_base_type
; CHECK-NEXT: .b8 105 // DW_AT_name
; CHECK-NEXT: .b8 110
; CHECK-NEXT: .b8 116
diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll
index 634e77a9c459e..dbd2662de4274 100644
--- a/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll
@@ -7,7 +7,8 @@ target triple = "nvptx64-nvidia-cuda"
define ptx_kernel i32 @test_kernel(ptr %a, ptr byval(i32) %b) {
; CHECK-LABEL: define ptx_kernel i32 @test_kernel(
; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) {
-; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; CHECK-NEXT: [[V1:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[B]], align 4
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]]
; CHECK-NEXT: ret i32 [[SUM]]
@@ -21,8 +22,9 @@ define ptx_kernel i32 @test_kernel(ptr %a, ptr byval(i32) %b) {
define i32 @test_device(ptr %a, ptr byval(i32) %b) {
; CHECK-LABEL: define i32 @test_device(
; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(5)
; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT: [[V2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]]
; CHECK-NEXT: ret i32 [[SUM]]
;
More information about the llvm-commits
mailing list