[llvm] [NVPTX] Improve kernel byval parameter lowering (PR #136008)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 16 13:52:55 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/136008
>From e3cd182fdf03385cfe2c49a18425a8e2e84a3313 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Sun, 6 Apr 2025 21:09:52 +0000
Subject: [PATCH 1/2] pre-commit tests
---
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 37 +++++++++-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 71 +++++++++++++++++++
2 files changed, 107 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 836a7d78a0cc5..5ddd5a4846ad8 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -560,13 +560,46 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
ret i32 %keepalive
}
+declare void @device_func(ptr byval(i32) align 4)
+
+define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
+; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: test_forward_byval_arg(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, test_forward_byval_arg_param_0;
+; PTX-NEXT: mov.b64 %rd2, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: ld.u32 %r1, [%rd3];
+; PTX-NEXT: { // callseq 4, 0
+; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: st.param.b32 [param0], %r1;
+; PTX-NEXT: call.uni
+; PTX-NEXT: device_func,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 4
+; PTX-NEXT: ret;
+ call void @device_func(ptr byval(i32) align 4 %input)
+ ret void
+}
declare dso_local void @dummy() local_unnamed_addr
declare dso_local ptr @escape(ptr) local_unnamed_addr
declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24}
!0 = !{ptr @grid_const_int, !"grid_constant", !1}
!1 = !{i32 1}
@@ -604,4 +637,6 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
!23 = !{i32 1}
+!24 = !{ptr @test_forward_byval_arg, !"grid_constant", !25}
+!25 = !{i32 1}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 2d8684c7cab48..698c37d3c70e2 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -988,6 +988,77 @@ merge: ; preds = %second, %first
ret void
}
+define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
+; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
+; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
+; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
+; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
+; COMMON-NEXT: ret void
+;
+; PTX-LABEL: test_forward_byval_arg(
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot17[4];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %SPL, __local_depot17;
+; PTX-NEXT: add.u64 %rd2, %SPL, 0;
+; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0];
+; PTX-NEXT: st.local.u32 [%rd2], %r1;
+; PTX-NEXT: { // callseq 2, 0
+; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: st.param.b32 [param0], %r1;
+; PTX-NEXT: call.uni
+; PTX-NEXT: device_func,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 2
+; PTX-NEXT: ret;
+ call void @device_func(ptr byval(i32) align 4 %input)
+ ret void
+}
+
+define void @device_func(ptr byval(i32) align 4 %input) {
+; LOWER-ARGS-LABEL: define void @device_func(
+; LOWER-ARGS-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
+; LOWER-ARGS-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT]])
+; LOWER-ARGS-NEXT: ret void
+;
+; COPY-LABEL: define void @device_func(
+; COPY-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
+; COPY-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
+; COPY-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
+; COPY-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
+; COPY-NEXT: ret void
+;
+; PTX-LABEL: device_func(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<2>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u32 %r1, [device_func_param_0];
+; PTX-NEXT: { // callseq 3, 0
+; PTX-NEXT: .param .align 4 .b8 param0[4];
+; PTX-NEXT: st.param.b32 [param0], %r1;
+; PTX-NEXT: call.uni
+; PTX-NEXT: device_func,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: } // callseq 3
+; PTX-NEXT: ret;
+ call void @device_func(ptr byval(i32) align 4 %input)
+ ret void
+}
+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
>From 106b0a070812f7b896dd85eef149f4b111970dc8 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Sun, 6 Apr 2025 20:48:18 +0000
Subject: [PATCH 2/2] [NVPTX] Improve kernel byval parameter lowering
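
NVPTXLowerArgs now wraps byval kernel arguments in a new, internal-only
llvm.nvvm.internal.noop.addrspacecast intrinsic; for arguments that do not
need a local copy this is followed by a plain param -> generic addrspacecast,
replacing the previous generic -> param addrspacecast plus
llvm.nvvm.ptr.param.to.gen pair. A rough sketch of the IR change, taken from
the test updates below:

  ; before
  %input.param = addrspacecast ptr %input to ptr addrspace(101)
  %input.param.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input.param)

  ; after
  %input.param = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr %input)
  %input.param.gen = addrspacecast ptr addrspace(101) %input.param to ptr

The noop intrinsic is folded away during ISel, which also lets loads through
the wrapped pointer be selected as direct ld.param accesses instead of a
mov.b64 / cvta.param.u64 / ld.u32 sequence. In addition, a byval argument
whose memory is only read and whose address is not captured is now treated
like a grid_constant even without the explicit annotation.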
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 15 ++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 24 ++
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 16 +-
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 89 +++---
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 33 ++-
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 2 +-
llvm/test/CodeGen/NVPTX/bug21465.ll | 2 +-
llvm/test/CodeGen/NVPTX/forward-ld-param.ll | 2 +-
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 178 ++++++------
llvm/test/CodeGen/NVPTX/lower-args.ll | 4 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 255 +++++++-----------
.../Inputs/nvptx-basic.ll.expected | 12 +-
13 files changed, 307 insertions(+), 330 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 4aeb1d8a2779e..5d89b0ae2b484 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1878,6 +1878,21 @@ def int_nvvm_ptr_param_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
[IntrNoMem, IntrSpeculatable, IntrNoCallback],
"llvm.nvvm.ptr.param.to.gen">;
+// Represents an explicit hole in the LLVM IR type system. It may be inserted by
+// the compiler in cases where a pointer is of the wrong type. In the backend
+// this intrinsic is folded away and does not lower to any instruction. It
+// should not be used by any frontend and is only considered well defined
+// when inserted in the following cases:
+//
+// - NVPTXLowerArgs: When wrapping a byval pointer argument to a kernel
+// function to convert the address space from generic (0) to param (101).
+// This accounts for the fact that the parameter symbols will occupy this
+// space when lowered during ISel.
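+//
+//    For illustration only (a sketch mirroring the NVPTXLowerArgs tests), the
+//    wrapped argument takes the form:
+//      %s.param = call ptr addrspace(101)
+//                   @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr %s)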
+//
+def int_nvvm_internal_noop_addrspacecast :
+ DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
+ [IntrNoMem, IntrSpeculatable, NoUndef<ArgIndex<0>>, NoUndef<RetIndex>]>;
+
// Move intrinsics, used in nvvm internally
def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem],
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ec1f969494cd1..486c7c815435a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -985,6 +985,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_LOCAL:
Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
break;
+ case ADDRESS_SPACE_PARAM:
+ Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
+ break;
}
ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
return;
@@ -1008,7 +1011,7 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
break;
case ADDRESS_SPACE_PARAM:
- Opc = TM.is64Bit() ? NVPTX::IMOV64r : NVPTX::IMOV32r;
+ Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
break;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 9bde2a976e164..166785a79ec4c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1017,6 +1017,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
{MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
MVT::v32i32, MVT::v64i32, MVT::v128i32},
Custom);
+
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
}
const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -1434,6 +1436,17 @@ static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG,
return MachinePointerInfo(ADDRESS_SPACE_LOCAL);
}
+
+  // Peel off an addrspacecast to generic and load directly from the specific
+ // address space.
+ if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
+ const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
+ if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
+ Ptr = ASC->getOperand(0);
+ return MachinePointerInfo(ASC->getSrcAddressSpace());
+ }
+ }
+
return MachinePointerInfo();
}
@@ -2754,6 +2767,15 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
return Op;
}
+static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
+ switch (Op->getConstantOperandVal(0)) {
+ default:
+ return Op;
+ case Intrinsic::nvvm_internal_noop_addrspacecast:
+ return Op.getOperand(1);
+ }
+}
+
// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
// Lower these into a node returning the correct type which is zero-extended
// back to the correct size.
@@ -2863,6 +2885,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalAddress(Op, DAG);
case ISD::INTRINSIC_W_CHAIN:
return Op;
+ case ISD::INTRINSIC_WO_CHAIN:
+ return lowerIntrinsicWOChain(Op, DAG);
case ISD::INTRINSIC_VOID:
return LowerIntrinsicVoid(Op, DAG);
case ISD::BUILD_VECTOR:
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8528ff702f236..266f379607690 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2335,18 +2335,10 @@ multiclass G_TO_NG<string Str> {
"cvta.to." # Str # ".u64 \t$result, $src;", []>;
}
-defm cvta_local : NG_TO_G<"local">;
-defm cvta_shared : NG_TO_G<"shared">;
-defm cvta_global : NG_TO_G<"global">;
-defm cvta_const : NG_TO_G<"const">;
-
-defm cvta_to_local : G_TO_NG<"local">;
-defm cvta_to_shared : G_TO_NG<"shared">;
-defm cvta_to_global : G_TO_NG<"global">;
-defm cvta_to_const : G_TO_NG<"const">;
-
-// nvvm.ptr.param.to.gen
-defm cvta_param : NG_TO_G<"param">;
+foreach space = ["local", "shared", "global", "const", "param"] in {
+ defm cvta_#space : NG_TO_G<space>;
+ defm cvta_to_#space : G_TO_NG<space>;
+}
def : Pat<(int_nvvm_ptr_param_to_gen i32:$src),
(cvta_param $src)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 6452fa05947dd..770914fcc2f28 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -265,18 +265,9 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
if (HasCvtaParam) {
auto GetParamAddrCastToGeneric =
[](Value *Addr, Instruction *OriginalUser) -> Value * {
- PointerType *ReturnTy =
- PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC);
- Function *CvtToGen = Intrinsic::getOrInsertDeclaration(
- OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen,
- {ReturnTy, PointerType::get(OriginalUser->getContext(),
- ADDRESS_SPACE_PARAM)});
-
- // Cast param address to generic address space
- Value *CvtToGenCall =
- CallInst::Create(CvtToGen, Addr, Addr->getName() + ".gen",
- OriginalUser->getIterator());
- return CvtToGenCall;
+ IRBuilder<> IRB(OriginalUser);
+ Type *GenTy = IRB.getPtrTy(ADDRESS_SPACE_GENERIC);
+ return IRB.CreateAddrSpaceCast(Addr, GenTy, Addr->getName() + ".gen");
};
auto *ParamInGenericAS =
GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
@@ -515,23 +506,24 @@ void copyByValParam(Function &F, Argument &Arg) {
BasicBlock::iterator FirstInst = F.getEntryBlock().begin();
Type *StructType = Arg.getParamByValType();
const DataLayout &DL = F.getDataLayout();
- AllocaInst *AllocA = new AllocaInst(StructType, DL.getAllocaAddrSpace(),
- Arg.getName(), FirstInst);
+ IRBuilder<> IRB(&*FirstInst);
+ AllocaInst *AllocA = IRB.CreateAlloca(StructType, nullptr, Arg.getName());
// Set the alignment to alignment of the byval parameter. This is because,
// later load/stores assume that alignment, and we are going to replace
// the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(F.getParamAlign(Arg.getArgNo())
- .value_or(DL.getPrefTypeAlign(StructType)));
+ AllocA->setAlignment(
+ Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
Arg.replaceAllUsesWith(AllocA);
- Value *ArgInParam = new AddrSpaceCastInst(
- &Arg, PointerType::get(Arg.getContext(), ADDRESS_SPACE_PARAM),
- Arg.getName(), FirstInst);
+ Value *ArgInParam =
+ IRB.CreateIntrinsic(Intrinsic::nvvm_internal_noop_addrspacecast,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
+ &Arg, {}, Arg.getName());
+
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
// is definitely not volatile.
const auto ArgSize = *AllocA->getAllocationSize(DL);
- IRBuilder<> IRB(&*FirstInst);
IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),
ArgSize);
}
@@ -539,9 +531,9 @@ void copyByValParam(Function &F, Argument &Arg) {
static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
Function *Func = Arg->getParent();
- bool HasCvtaParam =
- TM.getSubtargetImpl(*Func)->hasCvtaParam() && isKernelFunction(*Func);
- bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+ assert(isKernelFunction(*Func));
+ const bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+ const bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
const DataLayout &DL = Func->getDataLayout();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
@@ -558,9 +550,11 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
for (Use &U : Arg->uses())
UsesToUpdate.push_back(&U);
- Value *ArgInParamAS = new AddrSpaceCastInst(
- Arg, PointerType::get(StructType->getContext(), ADDRESS_SPACE_PARAM),
- Arg->getName(), FirstInst);
+ IRBuilder<> IRB(&*FirstInst);
+ Value *ArgInParamAS = IRB.CreateIntrinsic(
+ Intrinsic::nvvm_internal_noop_addrspacecast,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg});
+
for (Use *U : UsesToUpdate)
convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
@@ -578,30 +572,31 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
// However, we're still not allowed to write to it. If the user specified
// `__grid_constant__` for the argument, we'll consider escaped pointer as
// read-only.
- if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) {
+ if (IsGridConstant || (HasCvtaParam && ArgUseIsReadOnly)) {
LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
// Replace all argument pointer uses (which might include a device function
// call) with a cast to the generic address space using cvta.param
// instruction, which avoids a local copy.
IRBuilder<> IRB(&Func->getEntryBlock().front());
- // Cast argument to param address space
- auto *CastToParam = cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
- Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));
+  // Cast argument to param address space. Because the backend will emit the
+  // argument already in the param address space, we need to use the noop
+  // intrinsic; this has the added benefit of preventing other optimizations
+  // from folding away this pair of addrspacecasts.
+ auto *ParamSpaceArg =
+ IRB.CreateIntrinsic(Intrinsic::nvvm_internal_noop_addrspacecast,
+ {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()},
+ Arg, {}, Arg->getName() + ".param");
- // Cast param address to generic address space. We do not use an
- // addrspacecast to generic here, because, LLVM considers `Arg` to be in the
- // generic address space, and a `generic -> param` cast followed by a `param
- // -> generic` cast will be folded away. The `param -> generic` intrinsic
- // will be correctly lowered to `cvta.param`.
- Value *CvtToGenCall = IRB.CreateIntrinsic(
- IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen,
- CastToParam, nullptr, CastToParam->getName() + ".gen");
+ // Cast param address to generic address space.
+ Value *GenericArg = IRB.CreateAddrSpaceCast(
+ ParamSpaceArg, IRB.getPtrTy(ADDRESS_SPACE_GENERIC),
+ Arg->getName() + ".gen");
- Arg->replaceAllUsesWith(CvtToGenCall);
+ Arg->replaceAllUsesWith(GenericArg);
// Do not replace Arg in the cast to param space
- CastToParam->setOperand(0, Arg);
+ ParamSpaceArg->setOperand(0, Arg);
} else
copyByValParam(*Func, *Arg);
}
@@ -715,12 +710,14 @@ static bool copyFunctionByValArgs(Function &F) {
LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName()
<< "\n");
bool Changed = false;
- for (Argument &Arg : F.args())
- if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() &&
- !(isParamGridConstant(Arg) && isKernelFunction(F))) {
- copyByValParam(F, Arg);
- Changed = true;
- }
+ if (isKernelFunction(F)) {
+ for (Argument &Arg : F.args())
+ if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() &&
+ !isParamGridConstant(Arg)) {
+ copyByValParam(F, Arg);
+ Changed = true;
+ }
+ }
return Changed;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 3d9d2ae372080..0cbebc6995c9a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -16,11 +16,13 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Argument.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/Support/ModRef.h"
#include "llvm/Support/Mutex.h"
#include <cstdint>
#include <cstring>
@@ -228,17 +230,30 @@ static std::optional<uint64_t> getVectorProduct(ArrayRef<unsigned> V) {
return std::accumulate(V.begin(), V.end(), 1, std::multiplies<uint64_t>{});
}
-bool isParamGridConstant(const Value &V) {
- if (const Argument *Arg = dyn_cast<Argument>(&V)) {
- // "grid_constant" counts argument indices starting from 1
- if (Arg->hasByValAttr() &&
- argHasNVVMAnnotation(*Arg, "grid_constant",
- /*StartArgIndexAtOne*/ true)) {
- assert(isKernelFunction(*Arg->getParent()) &&
- "only kernel arguments can be grid_constant");
+bool isParamGridConstant(const Argument &Arg) {
+ assert(isKernelFunction(*Arg.getParent()) &&
+ "only kernel arguments can be grid_constant");
+
+ if (!Arg.hasByValAttr())
+ return false;
+
+  // Lowering an argument as a grid_constant violates the byval semantics (and
+  // the C++ API) by reusing the same memory location for the argument across
+  // multiple threads. If the memory behind the argument is only read and its
+  // address is not captured (e.g. never compared with another pointer), then
+  // this tweak of the C++ API and byval semantics is unobservable by the
+  // program and we can lower the arg as a grid_constant.
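+  //
+  // For example (illustrative, mirroring the read_only tests in
+  // lower-byval-args.ll), an argument such as
+  //   ptr noundef readonly byval(%struct.S) align 4 captures(none) %s
+  // can be lowered as a grid_constant even without the "grid_constant"
+  // annotation.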
+ if (Arg.onlyReadsMemory()) {
+ const auto CI = Arg.getAttributes().getCaptureInfo();
+ if (!capturesAddress(CI) && !capturesFullProvenance(CI))
return true;
- }
}
+
+ // "grid_constant" counts argument indices starting from 1
+ if (argHasNVVMAnnotation(Arg, "grid_constant",
+ /*StartArgIndexAtOne*/ true))
+ return true;
+
return false;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 9283b398a9c14..9adbb645deb4a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -63,7 +63,7 @@ inline bool isKernelFunction(const Function &F) {
return F.getCallingConv() == CallingConv::PTX_Kernel;
}
-bool isParamGridConstant(const Value &);
+bool isParamGridConstant(const Argument &);
inline MaybeAlign getAlign(const Function &F, unsigned Index) {
return F.getAttributes().getAttributes(Index).getStackAlignment();
diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll
index 76300e3cfdc5b..21ec05c70e3ad 100644
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@@ -12,7 +12,7 @@ define ptx_kernel void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture reado
entry:
; CHECK-LABEL: @_Z11TakesStruct1SPi
; PTX-LABEL: .visible .entry _Z11TakesStruct1SPi(
-; CHECK: addrspacecast ptr %input to ptr addrspace(101)
+; CHECK: call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr %input)
%b = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
%0 = load i32, ptr %b, align 4
; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
index 6d9710e6d2272..80ae8aac39115 100644
--- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
+++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
@@ -65,7 +65,7 @@ define void @test_ld_param_byval(ptr byval(i32) %a) {
; CHECK-LABEL: test_ld_param_byval(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_byval_param_0];
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 5ddd5a4846ad8..46535a7a91c28 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -72,7 +72,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_int(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
@@ -101,7 +101,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
@@ -122,16 +122,15 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<5>;
+; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_escape_param_0;
-; PTX-NEXT: mov.b64 %rd3, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd4;
+; PTX-NEXT: st.param.b64 [param0], %rd3;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: call (retval0),
@@ -145,8 +144,8 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
%call = call i32 @escape(ptr %input)
@@ -160,29 +159,27 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<4>;
-; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %SPL, __local_depot4;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0;
-; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2;
-; PTX-NEXT: mov.b64 %rd4, %rd3;
; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
-; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
-; PTX-NEXT: mov.b64 %rd6, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd7, %rd6;
-; PTX-NEXT: add.u64 %rd8, %SP, 0;
-; PTX-NEXT: add.u64 %rd9, %SPL, 0;
-; PTX-NEXT: st.local.u32 [%rd9], %r1;
+; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2;
+; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
+; PTX-NEXT: add.u64 %rd6, %SP, 0;
+; PTX-NEXT: add.u64 %rd7, %SPL, 0;
+; PTX-NEXT: st.local.u32 [%rd7], %r1;
; PTX-NEXT: mov.b64 %rd1, escape3;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd7;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 param1;
-; PTX-NEXT: st.param.b64 [param1], %rd8;
+; PTX-NEXT: st.param.b64 [param1], %rd6;
; PTX-NEXT: .param .b64 param2;
-; PTX-NEXT: st.param.b64 [param2], %rd5;
+; PTX-NEXT: st.param.b64 [param2], %rd4;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
; PTX-NEXT: call (retval0),
@@ -198,10 +195,10 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[B]])
+; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
+; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
@@ -215,20 +212,19 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
; PTX-LABEL: grid_const_memory_escape(
; PTX: {
-; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0;
; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
-; PTX-NEXT: mov.b64 %rd4, %rd1;
-; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
-; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: cvta.param.u64 %rd4, %rd1;
+; PTX-NEXT: st.global.u64 [%rd3], %rd4;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
; OPT-NEXT: ret void
store ptr %input, ptr %addr, align 8
@@ -238,14 +234,13 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
; PTX: {
-; PTX-NEXT: .reg .b64 %rd<8>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5;
-; PTX-NEXT: mov.b64 %rd7, %rd4;
-; PTX-NEXT: cvta.param.u64 %rd2, %rd7;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd4;
; PTX-NEXT: add.s64 %rd3, %rd2, 4;
; PTX-NEXT: // begin inline asm
; PTX-NEXT: add.s64 %rd1, %rd2, %rd3;
@@ -255,8 +250,8 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
; PTX-NOT .local
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
@@ -273,21 +268,20 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
; PTX-LABEL: grid_const_partial_escape(
; PTX: {
; PTX-NEXT: .reg .b32 %r<5>;
-; PTX-NEXT: .reg .b64 %rd<7>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escape_param_0;
; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; PTX-NEXT: mov.b64 %rd5, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX-NEXT: ld.u32 %r1, [%rd6];
+; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escape_param_0];
; PTX-NEXT: add.s32 %r2, %r1, %r1;
; PTX-NEXT: st.global.u32 [%rd4], %r2;
; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 2, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: call (retval0),
@@ -301,8 +295,8 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT]], align 4
@@ -319,22 +313,21 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input,
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
; PTX-NEXT: .reg .b32 %r<6>;
-; PTX-NEXT: .reg .b64 %rd<7>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escapemem_param_0;
; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3;
-; PTX-NEXT: mov.b64 %rd5, %rd2;
-; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX-NEXT: ld.u32 %r1, [%rd6];
-; PTX-NEXT: ld.u32 %r2, [%rd6+4];
-; PTX-NEXT: st.global.u64 [%rd4], %rd6;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escapemem_param_0];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_partial_escapemem_param_0+4];
+; PTX-NEXT: st.global.u64 [%rd4], %rd5;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
; PTX-NEXT: mov.b64 %rd1, escape;
; PTX-NEXT: { // callseq 3, 0
; PTX-NEXT: .param .b64 param0;
-; PTX-NEXT: st.param.b64 [param0], %rd6;
+; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: call (retval0),
@@ -349,8 +342,8 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input,
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
@@ -374,27 +367,25 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0;
-; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1];
-; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
-; PTX-NEXT: mov.b64 %rd7, %rd5;
-; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: mov.b64 %rd6, grid_const_phi_param_0;
+; PTX-NEXT: ld.param.u64 %rd5, [grid_const_phi_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd5;
; PTX-NEXT: ld.global.u32 %r1, [%rd1];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT: @%p1 bra $L__BB9_2;
; PTX-NEXT: // %bb.1: // %second
-; PTX-NEXT: add.s64 %rd8, %rd8, 4;
+; PTX-NEXT: add.s64 %rd6, %rd6, 4;
; PTX-NEXT: $L__BB9_2: // %merge
-; PTX-NEXT: ld.u32 %r2, [%rd8];
+; PTX-NEXT: ld.param.u32 %r2, [%rd6];
; PTX-NEXT: st.global.u32 [%rd1], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
@@ -432,32 +423,28 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<12>;
+; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0;
-; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
-; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7;
-; PTX-NEXT: mov.b64 %rd10, %rd6;
-; PTX-NEXT: cvta.param.u64 %rd11, %rd10;
+; PTX-NEXT: mov.b64 %rd7, grid_const_phi_ngc_param_0;
+; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_ngc_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
; PTX-NEXT: ld.global.u32 %r1, [%rd1];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT: @%p1 bra $L__BB10_2;
; PTX-NEXT: // %bb.1: // %second
-; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1;
-; PTX-NEXT: mov.b64 %rd9, %rd8;
-; PTX-NEXT: cvta.param.u64 %rd2, %rd9;
-; PTX-NEXT: add.s64 %rd11, %rd2, 4;
+; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1;
+; PTX-NEXT: add.s64 %rd7, %rd2, 4;
; PTX-NEXT: $L__BB10_2: // %merge
-; PTX-NEXT: ld.u32 %r2, [%rd11];
+; PTX-NEXT: ld.param.u32 %r2, [%rd7];
; PTX-NEXT: st.global.u32 [%rd1], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
+; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
@@ -494,29 +481,25 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
-; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0;
; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1;
-; PTX-NEXT: mov.b64 %rd5, %rd4;
-; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX-NEXT: mov.b64 %rd7, %rd1;
-; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
; PTX-NEXT: ld.global.u32 %r1, [%rd3];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
-; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
-; PTX-NEXT: ld.u32 %r2, [%rd9];
+; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX-NEXT: ld.param.u32 %r2, [%rd5];
; PTX-NEXT: st.global.u32 [%rd3], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_select(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
+; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
@@ -535,22 +518,21 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
; PTX-LABEL: grid_const_ptrtoint(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
-; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0;
-; PTX-NEXT: mov.b64 %rd2, %rd1;
; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
-; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
-; PTX-NEXT: cvt.u32.u64 %r2, %rd3;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd1;
+; PTX-NEXT: cvt.u32.u64 %r2, %rd2;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
-; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr
; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
; OPT-NEXT: ret i32 [[KEEPALIVE]]
@@ -565,21 +547,17 @@ declare void @device_func(ptr byval(i32) align 4)
define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
;
; PTX-LABEL: test_forward_byval_arg(
; PTX: {
; PTX-NEXT: .reg .b32 %r<2>;
-; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
-; PTX-NEXT: mov.b64 %rd1, test_forward_byval_arg_param_0;
-; PTX-NEXT: mov.b64 %rd2, %rd1;
-; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
-; PTX-NEXT: ld.u32 %r1, [%rd3];
+; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0];
; PTX-NEXT: { // callseq 4, 0
; PTX-NEXT: .param .align 4 .b8 param0[4];
; PTX-NEXT: st.param.b32 [param0], %r1;
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 8fa7d5c3e0cbc..48e02e3dcf305 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -210,7 +210,7 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRC-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
@@ -220,7 +220,7 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRO-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 698c37d3c70e2..ce5b42c02bc40 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -32,7 +32,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT: ret void
@@ -40,10 +40,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
; COPY-LABEL: define dso_local ptx_kernel void @read_only(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S1]], align 4
+; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S]], align 4
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
;
@@ -69,7 +66,7 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
@@ -78,10 +75,7 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4
; COPY-NEXT: [[I:%.*]] = load i32, ptr [[B]], align 4
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT: ret void
@@ -109,7 +103,7 @@ define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeo
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
@@ -118,10 +112,7 @@ define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeo
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4
; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
; COPY-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[ASC]], align 4
; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
@@ -148,18 +139,40 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; COMMON-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
-; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: [[ENTRY:.*:]]
-; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
-; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
-; COMMON-NEXT: store i32 [[I]], ptr [[OUT]], align 4
-; COMMON-NEXT: ret void
+; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
+; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; SM_60-NEXT: [[ENTRY:.*:]]
+; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; SM_60-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
+; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; SM_60-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; SM_60-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; SM_60-NEXT: store i32 [[I]], ptr [[OUT]], align 4
+; SM_60-NEXT: ret void
+;
+; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
+; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; SM_70-NEXT: [[ENTRY:.*:]]
+; SM_70-NEXT: [[S_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
+; SM_70-NEXT: [[S_GEN:%.*]] = addrspacecast ptr addrspace(101) [[S_PARAM]] to ptr
+; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S_GEN]], i64 4
+; SM_70-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; SM_70-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; SM_70-NEXT: store i32 [[I]], ptr [[OUT]], align 4
+; SM_70-NEXT: ret void
+;
+; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0(
+; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COPY-NEXT: [[ENTRY:.*:]]
+; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4
+; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; COPY-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; COPY-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4
+; COPY-NEXT: ret void
;
; PTX-LABEL: read_only_gep_asc0(
; PTX: {
@@ -183,32 +196,14 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]]
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
-; SM_70-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr(
-; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]]
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr(
+; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
+; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptr(
; PTX: {
@@ -244,35 +239,15 @@ entry:
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
-; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_60-NEXT: [[ENTRY:.*:]]
-; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]]
-; SM_60-NEXT: ret void
-;
-; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; SM_70-NEXT: [[ENTRY:.*:]]
-; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
-; SM_70-NEXT: ret void
-;
-; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
-; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
-; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]]
-; COPY-NEXT: ret void
+; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
+; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
+; COMMON-NEXT: ret void
;
; PTX-LABEL: escape_ptr_gep(
; PTX: {
@@ -314,7 +289,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
; COMMON-NEXT: ret void
@@ -351,7 +326,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8
@@ -391,7 +366,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64
; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8
@@ -429,17 +404,14 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
-; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true)
+; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true)
; COPY-NEXT: ret void
;
; PTX-LABEL: memcpy_from_param(
@@ -492,17 +464,14 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT: [[ENTRY:.*:]]
-; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 8
-; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[S1]], ptr addrspace(101) align 8 [[S2]], i64 8, i1 false)
-; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true)
+; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true)
; COPY-NEXT: ret void
;
; PTX-LABEL: memcpy_from_param_noalign(
@@ -551,12 +520,12 @@ entry:
}
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
-; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
; COMMON-NEXT: ret void
@@ -636,7 +605,7 @@ define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4
; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4
@@ -659,10 +628,10 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_60-NEXT: [[BB:.*:]]
; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -672,10 +641,10 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_70-LABEL: define ptx_kernel void @test_select(
; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
+; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4
@@ -685,10 +654,10 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; COPY-NEXT: [[BB:.*:]]
; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -719,7 +688,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; PTX_70-NEXT: .reg .pred %p<2>;
; PTX_70-NEXT: .reg .b16 %rs<3>;
; PTX_70-NEXT: .reg .b32 %r<2>;
-; PTX_70-NEXT: .reg .b64 %rd<10>;
+; PTX_70-NEXT: .reg .b64 %rd<6>;
; PTX_70-EMPTY:
; PTX_70-NEXT: // %bb.0: // %bb
; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3];
@@ -729,12 +698,8 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2];
; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1;
-; PTX_70-NEXT: mov.b64 %rd5, %rd4;
-; PTX_70-NEXT: cvta.param.u64 %rd6, %rd5;
-; PTX_70-NEXT: mov.b64 %rd7, %rd1;
-; PTX_70-NEXT: cvta.param.u64 %rd8, %rd7;
-; PTX_70-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
-; PTX_70-NEXT: ld.u32 %r1, [%rd9];
+; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX_70-NEXT: ld.param.u32 %r1, [%rd5];
; PTX_70-NEXT: st.global.u32 [%rd3], %r1;
; PTX_70-NEXT: ret;
bb:
@@ -749,10 +714,10 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[INPUT23:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
@@ -795,10 +760,10 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_60-NEXT: [[BB:.*:]]
; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_60: [[FIRST]]:
@@ -816,10 +781,10 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_70-LABEL: define ptx_kernel void @test_phi(
; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_70-NEXT: [[BB:.*:]]
-; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
-; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
+; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_70: [[FIRST]]:
; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
@@ -837,10 +802,10 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; COPY-NEXT: [[BB:.*:]]
; COPY-NEXT: [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8
-; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COPY: [[FIRST]]:
@@ -881,25 +846,21 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; PTX_70-NEXT: .reg .pred %p<2>;
; PTX_70-NEXT: .reg .b16 %rs<3>;
; PTX_70-NEXT: .reg .b32 %r<2>;
-; PTX_70-NEXT: .reg .b64 %rd<12>;
+; PTX_70-NEXT: .reg .b64 %rd<8>;
; PTX_70-EMPTY:
; PTX_70-NEXT: // %bb.0: // %bb
; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3];
; PTX_70-NEXT: and.b16 %rs2, %rs1, 1;
; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0;
-; PTX_70-NEXT: mov.b64 %rd6, test_phi_param_0;
-; PTX_70-NEXT: ld.param.u64 %rd7, [test_phi_param_2];
-; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd7;
-; PTX_70-NEXT: mov.b64 %rd10, %rd6;
-; PTX_70-NEXT: cvta.param.u64 %rd11, %rd10;
+; PTX_70-NEXT: mov.b64 %rd7, test_phi_param_0;
+; PTX_70-NEXT: ld.param.u64 %rd6, [test_phi_param_2];
+; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd6;
; PTX_70-NEXT: @%p1 bra $L__BB15_2;
; PTX_70-NEXT: // %bb.1: // %second
-; PTX_70-NEXT: mov.b64 %rd8, test_phi_param_1;
-; PTX_70-NEXT: mov.b64 %rd9, %rd8;
-; PTX_70-NEXT: cvta.param.u64 %rd2, %rd9;
-; PTX_70-NEXT: add.s64 %rd11, %rd2, 4;
+; PTX_70-NEXT: mov.b64 %rd2, test_phi_param_1;
+; PTX_70-NEXT: add.s64 %rd7, %rd2, 4;
; PTX_70-NEXT: $L__BB15_2: // %merge
-; PTX_70-NEXT: ld.u32 %r1, [%rd11];
+; PTX_70-NEXT: ld.param.u32 %r1, [%rd7];
; PTX_70-NEXT: st.global.u32 [%rd1], %r1;
; PTX_70-NEXT: ret;
bb:
@@ -925,10 +886,10 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COMMON: [[FIRST]]:
@@ -992,7 +953,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
-; COMMON-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.noop.addrspacecast.p101.p0(ptr [[INPUT]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
; COMMON-NEXT: ret void
@@ -1025,23 +986,15 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
}
define void @device_func(ptr byval(i32) align 4 %input) {
-; LOWER-ARGS-LABEL: define void @device_func(
-; LOWER-ARGS-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
-; LOWER-ARGS-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT]])
-; LOWER-ARGS-NEXT: ret void
-;
-; COPY-LABEL: define void @device_func(
-; COPY-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
-; COPY-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
-; COPY-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
-; COPY-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
-; COPY-NEXT: ret void
+; COMMON-LABEL: define void @device_func(
+; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
+; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT]])
+; COMMON-NEXT: ret void
;
; PTX-LABEL: device_func(
; PTX: {
; PTX-NEXT: .reg .b32 %r<2>;
-; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-NEXT: .reg .b64 %rd<2>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.u32 %r1, [device_func_param_0];
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
index e470569bfae19..ad0b11ed6a806 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected
@@ -6,7 +6,7 @@
define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: caller_St8x4(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
@@ -27,11 +27,11 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0];
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16];
; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: ld.param.u32 %r3, [caller_St8x4_param_1];
-; CHECK-NEXT: st.u64 [%r3], %rd5;
-; CHECK-NEXT: st.u64 [%r3+8], %rd6;
-; CHECK-NEXT: st.u64 [%r3+16], %rd7;
-; CHECK-NEXT: st.u64 [%r3+24], %rd8;
+; CHECK-NEXT: ld.param.u32 %r2, [caller_St8x4_param_1];
+; CHECK-NEXT: st.u64 [%r2], %rd5;
+; CHECK-NEXT: st.u64 [%r2+8], %rd6;
+; CHECK-NEXT: st.u64 [%r2+16], %rd7;
+; CHECK-NEXT: st.u64 [%r2+24], %rd8;
; CHECK-NEXT: ret;
%call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2
%.fca.0.extract = extractvalue [4 x i64] %call, 0