[llvm] [NVPTXLowerArgs] Add align attribute to return value of addrspace.wrap intrinsic (PR #153889)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 18 09:51:19 PDT 2025
https://github.com/dakersnar updated https://github.com/llvm/llvm-project/pull/153889
>From 17713f5077d6b2fe5c855a20e26603ddc97d8c13 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Fri, 15 Aug 2025 22:12:36 +0000
Subject: [PATCH 1/3] [NVPTXLowerArgs] Add alignment attribute to return value
of addrspace wrap intrinsic
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 8 ++++
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 16 +++++++-
.../CodeGen/NVPTX/lower-args-alignment.ll | 37 ++++++++++++++++++
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 22 +++++------
llvm/test/CodeGen/NVPTX/lower-args.ll | 4 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 38 +++++++++----------
6 files changed, 91 insertions(+), 34 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2445005bf98ce..6abb7a0cd1845 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1027,9 +1027,16 @@ static inline bool isAddLike(const SDValue V) {
(V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
}
+static SDValue stripAssertAlign(SDValue N) {
+ while (N.getOpcode() == ISD::AssertAlign)
+ N = N.getOperand(0);
+ return N;
+}
+
// selectBaseADDR - Match a dag node which will serve as the base address for an
// ADDR operand pair.
static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
+ N = stripAssertAlign(N);
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
GA->getValueType(0), GA->getOffset(),
@@ -1044,6 +1051,7 @@ static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
}
static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
+ Addr = stripAssertAlign(Addr);
APInt AccumulatedOffset(64u, 0);
while (isAddLike(Addr)) {
const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index f4362fe8d9056..a4709dfd1d95a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -515,11 +515,15 @@ void copyByValParam(Function &F, Argument &Arg) {
Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
Arg.replaceAllUsesWith(AllocA);
- Value *ArgInParam =
+ CallInst *ArgInParam =
IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
{IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
&Arg, {}, Arg.getName());
+ if (MaybeAlign ParamAlign = Arg.getParamAlign())
+ ArgInParam->addRetAttr(
+ Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
+
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
// is definitely not volatile.
@@ -549,10 +553,14 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));
IRBuilder<> IRB(&*FirstInst);
- Value *ArgInParamAS = IRB.CreateIntrinsic(
+ CallInst *ArgInParamAS = IRB.CreateIntrinsic(
Intrinsic::nvvm_internal_addrspace_wrap,
{IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg});
+ if (MaybeAlign ParamAlign = Arg->getParamAlign())
+ ArgInParamAS->addRetAttr(
+ Attribute::getWithAlignment(ArgInParamAS->getContext(), *ParamAlign));
+
for (Use *U : UsesToUpdate)
convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
@@ -586,6 +594,10 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
{IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()},
Arg, {}, Arg->getName() + ".param");
+ if (MaybeAlign ParamAlign = Arg->getParamAlign())
+ ParamSpaceArg->addRetAttr(
+ Attribute::getWithAlignment(ParamSpaceArg->getContext(), *ParamAlign));
+
// Cast param address to generic address space.
Value *GenericArg = IRB.CreateAddrSpaceCast(
ParamSpaceArg, IRB.getPtrTy(ADDRESS_SPACE_GENERIC),
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
new file mode 100644
index 0000000000000..cd96cdeea4e6b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=nvptx-lower-args,infer-alignment -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; ------------------------------------------------------------------------------
+; Test that alignment can be inferred through llvm.nvvm.internal.addrspace.wrap.p101.p0 intrinsics
+; thanks to the alignment attribute on the intrinsic
+; ------------------------------------------------------------------------------
+
+%struct.S1 = type { i32, i32, i32, i32 }
+define ptx_kernel i32 @test_align8(ptr noundef readonly byval(%struct.S1) align 8 captures(none) %params) {
+; CHECK-LABEL: define ptx_kernel i32 @test_align8(
+; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 8 captures(none) [[PARAMS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 8
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+entry:
+ %load = load i32, ptr %params, align 4
+ ret i32 %load
+}
+
+define ptx_kernel i32 @test_align1(ptr noundef readonly byval(%struct.S1) align 1 captures(none) %params) {
+; CHECK-LABEL: define ptx_kernel i32 @test_align1(
+; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 4 captures(none) [[PARAMS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call align 1 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 4
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+entry:
+ %load = load i32, ptr %params, align 4
+ ret i32 %load
+}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 045704bdcd3fc..f5df0fcde1883 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -72,7 +72,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_int(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[INPUT11:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
@@ -101,7 +101,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
@@ -137,7 +137,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
@@ -180,9 +180,9 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
@@ -208,7 +208,7 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
; OPT-NEXT: ret void
@@ -235,7 +235,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
; PTX-NOT .local
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
@@ -357,7 +357,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -416,7 +416,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -471,7 +471,7 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -520,7 +520,7 @@ declare void @device_func(ptr byval(i32) align 4)
define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 7c029ab516d6e..b4a51035c6610 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -200,7 +200,7 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; IRC-NEXT: [[S3:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
@@ -210,7 +210,7 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; IRO-NEXT: [[S1:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 20a35198c3c16..4d36ff9496ede 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -32,7 +32,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT: ret void
@@ -66,7 +66,7 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
@@ -128,7 +128,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
; COMMON-NEXT: ret void
@@ -167,7 +167,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
@@ -209,7 +209,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
; COMMON-NEXT: ret void
@@ -246,7 +246,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8
@@ -286,7 +286,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64
; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8
@@ -324,7 +324,7 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
@@ -445,7 +445,7 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
; COMMON-NEXT: ret void
@@ -525,7 +525,7 @@ define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4
; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4
@@ -551,7 +551,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_60-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -563,7 +563,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_70-NEXT: [[BB:.*:]]
; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
-; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -577,7 +577,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COPY-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -637,7 +637,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
; COMMON-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COMMON-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
@@ -682,7 +682,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_60-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_60: [[FIRST]]:
@@ -702,7 +702,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_70-NEXT: [[BB:.*:]]
; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
-; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_70: [[FIRST]]:
@@ -724,7 +724,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COPY-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COPY: [[FIRST]]:
@@ -808,7 +808,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
; COMMON-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COMMON-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COMMON: [[FIRST]]:
@@ -871,7 +871,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; COMMON-NEXT: [[INPUT2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
; COMMON-NEXT: ret void
>From 00b267acd428cc8bfee0dd6f1d377b50aa6bfaff Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Fri, 15 Aug 2025 22:33:09 +0000
Subject: [PATCH 2/3] Clang format
---
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index a4709dfd1d95a..cf2d4a51c48e8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -595,8 +595,8 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
Arg, {}, Arg->getName() + ".param");
if (MaybeAlign ParamAlign = Arg->getParamAlign())
- ParamSpaceArg->addRetAttr(
- Attribute::getWithAlignment(ParamSpaceArg->getContext(), *ParamAlign));
+ ParamSpaceArg->addRetAttr(Attribute::getWithAlignment(
+ ParamSpaceArg->getContext(), *ParamAlign));
// Cast param address to generic address space.
Value *GenericArg = IRB.CreateAddrSpaceCast(
>From f1553756a727a170251407bdb472f8038ada1aa9 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Mon, 18 Aug 2025 16:50:56 +0000
Subject: [PATCH 3/3] Address reviewer feedback
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 +--
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 43 +++++++++----------
.../CodeGen/NVPTX/lower-args-alignment.ll | 1 -
3 files changed, 22 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6abb7a0cd1845..7c4456b51f9b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1028,7 +1028,7 @@ static inline bool isAddLike(const SDValue V) {
}
static SDValue stripAssertAlign(SDValue N) {
- while (N.getOpcode() == ISD::AssertAlign)
+ if (N.getOpcode() == ISD::AssertAlign)
N = N.getOperand(0);
return N;
}
@@ -1051,7 +1051,6 @@ static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
}
static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
- Addr = stripAssertAlign(Addr);
APInt AccumulatedOffset(64u, 0);
while (isAddLike(Addr)) {
const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
@@ -1063,7 +1062,7 @@ static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
break;
AccumulatedOffset += CI;
- Addr = Addr->getOperand(0);
+ Addr = stripAssertAlign(Addr->getOperand(0));
}
return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
MVT::i32);
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index cf2d4a51c48e8..2311922e63e69 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -412,6 +412,23 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
+// Create a call to the nvvm_internal_addrspace_wrap intrinsic for Arg. If Arg
+// has an explicit parameter alignment, attach it to the call's return value.
+static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB,
+ Argument &Arg,
+ const Twine &Name = "") {
+ CallInst *ArgInParam =
+ IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
+ &Arg, {}, Name);
+
+ if (MaybeAlign ParamAlign = Arg.getParamAlign())
+ ArgInParam->addRetAttr(
+ Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
+
+ return ArgInParam;
+}
+
namespace {
struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
using Base = PtrUseVisitor<ArgUseChecker>;
@@ -515,14 +532,7 @@ void copyByValParam(Function &F, Argument &Arg) {
Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
Arg.replaceAllUsesWith(AllocA);
- CallInst *ArgInParam =
- IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
- {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
- &Arg, {}, Arg.getName());
-
- if (MaybeAlign ParamAlign = Arg.getParamAlign())
- ArgInParam->addRetAttr(
- Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
+ CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg, Arg.getName());
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
@@ -553,13 +563,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));
IRBuilder<> IRB(&*FirstInst);
- CallInst *ArgInParamAS = IRB.CreateIntrinsic(
- Intrinsic::nvvm_internal_addrspace_wrap,
- {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg});
-
- if (MaybeAlign ParamAlign = Arg->getParamAlign())
- ArgInParamAS->addRetAttr(
- Attribute::getWithAlignment(ArgInParamAS->getContext(), *ParamAlign));
+ CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);
for (Use *U : UsesToUpdate)
convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
@@ -589,14 +593,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
// argument already in the param address space, we need to use the noop
// intrinsic, this had the added benefit of preventing other optimizations
// from folding away this pair of addrspacecasts.
- auto *ParamSpaceArg =
- IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
- {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()},
- Arg, {}, Arg->getName() + ".param");
-
- if (MaybeAlign ParamAlign = Arg->getParamAlign())
- ParamSpaceArg->addRetAttr(Attribute::getWithAlignment(
- ParamSpaceArg->getContext(), *ParamAlign));
+ auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg, Arg->getName() + ".param");
// Cast param address to generic address space.
Value *GenericArg = IRB.CreateAddrSpaceCast(
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
index cd96cdeea4e6b..2051f6305cc03 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=nvptx-lower-args,infer-alignment -S | FileCheck %s
-target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; ------------------------------------------------------------------------------
More information about the llvm-commits mailing list