[llvm] [NVPTX] Copy kernel arguments as byte array (PR #110356)
Michael Kuron via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 29 07:47:49 PDT 2024
https://github.com/mkuron updated https://github.com/llvm/llvm-project/pull/110356
>From fc56a3cd26e7403625f0b98e81f495b06b9d29c0 Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron at gmx.de>
Date: Sat, 28 Sep 2024 12:57:43 +0200
Subject: [PATCH 1/2] [NVPTX] Copy kernel arguments as byte array
Ensures that struct padding is not skipped, as it may contain actual
data if the struct is really a union.
Fixes #53710
---
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 11 ++-
llvm/test/CodeGen/NVPTX/lower-args.ll | 2 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 84 +++++++++++++--------
3 files changed, 62 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 082546c4dd72f8..7fc6b8949f8a60 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -626,10 +626,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
// is definitely not volatile.
+ const auto StructBytes = *AllocA->getAllocationSize(DL);
+ const auto ChunkBytes = (StructBytes % 8 == 0) ? 8 :
+ (StructBytes % 4 == 0) ? 4 :
+ (StructBytes % 2 == 0) ? 2 : 1;
+ Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes);
+ Type *OpaqueType = ArrayType::get(ChunkType, StructBytes / ChunkBytes);
LoadInst *LI =
- new LoadInst(StructType, ArgInParam, Arg->getName(),
+ new LoadInst(OpaqueType, ArgInParam, Arg->getName(),
/*isVolatile=*/false, AllocA->getAlign(), FirstInst);
- new StoreInst(LI, AllocA, FirstInst);
+ new StoreInst(LI, AllocA,
+ /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
}
}
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 029f1944d596b3..9a306036044be4 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -14,7 +14,7 @@ target triple = "nvptx64-nvidia-cuda"
; COMMON-LABEL: load_alignment
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
entry:
-; IR: load %class.outer, ptr addrspace(101)
+; IR: load [3 x i64], ptr addrspace(101)
; IR-SAME: align 8
; PTX: ld.param.u64
; PTX-NOT: ld.param.u8
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index a414a6c41cd5b2..5b55e3c5b7280d 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -88,8 +88,8 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -115,8 +115,8 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
@@ -134,8 +134,8 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -155,8 +155,8 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
@@ -174,8 +174,8 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -195,8 +195,8 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
@@ -232,8 +232,8 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
@@ -251,8 +251,8 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
@@ -273,12 +273,12 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
-; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
+; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4
+; SM_60-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
-; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4
@@ -313,12 +313,12 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
-; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
+; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4
+; COMMON-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
-; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
; COMMON-NEXT: ret void
@@ -337,12 +337,12 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr
; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
-; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
+; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8
+; SM_60-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 8
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
-; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_60: [[FIRST]]:
; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
@@ -402,12 +402,12 @@ define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%str
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
-; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
+; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8
+; COMMON-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 8
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
-; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COMMON: [[FIRST]]:
; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
@@ -437,6 +437,26 @@ merge: ; preds = %second, %first
ret void
}
+%union.U = type { %struct.P }
+%struct.P = type { i8, i32 }
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @padding(ptr nocapture noundef readnone %out, ptr noundef byval(%union.U) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @padding(
+; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[UNION_U:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[UNION_U]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S3:%.*]] = load [1 x i64], ptr addrspace(101) [[S2]], align 4
+; COMMON-NEXT: store [1 x i64] [[S3]], ptr [[S1]], align 4
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]])
+; COMMON-NEXT: ret void
+;
+entry:
+ call void @_Z6escapePv(ptr noundef nonnull %s) #0
+ ret void
+}
+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
>From 6a83ec67a7870c4d056a06f730205ceb4047d47d Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron at gmx.de>
Date: Sun, 29 Sep 2024 13:44:13 +0200
Subject: [PATCH 2/2] [NVPTX] Coalesce kernel argument copies
---
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 25 ++++--
llvm/test/CodeGen/NVPTX/lower-args.ll | 3 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 88 +++++++++++++--------
3 files changed, 75 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 7fc6b8949f8a60..8604cf68e40620 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -623,15 +623,28 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Value *ArgInParam = new AddrSpaceCastInst(
Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM),
Arg->getName(), FirstInst);
+ // Create an opaque type of same size as StructType but without padding
+ // holes as this could have been a union.
+ const auto StructBytes = *AllocA->getAllocationSize(DL);
+ SmallVector<Type *, 5> ChunkTypes;
+ if (StructBytes >= 16) {
+ Type *IntType = Type::getInt64Ty(Func->getContext());
+ Type *ChunkType = VectorType::get(IntType, 2, false);
+ Type *OpaqueType = StructBytes < 32 ? ChunkType :
+ ArrayType::get(ChunkType, StructBytes / 16);
+ ChunkTypes.push_back(OpaqueType);
+ }
+ for (const auto ChunkBytes: {8, 4, 2, 1}) {
+ if (StructBytes & ChunkBytes) {
+ Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes);
+ ChunkTypes.push_back(ChunkType);
+ }
+ }
+ Type * OpaqueType = ChunkTypes.size() == 1 ? ChunkTypes[0] :
+ StructType::create(ChunkTypes);
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
// is definitely not volatile.
- const auto StructBytes = *AllocA->getAllocationSize(DL);
- const auto ChunkBytes = (StructBytes % 8 == 0) ? 8 :
- (StructBytes % 4 == 0) ? 4 :
- (StructBytes % 2 == 0) ? 2 : 1;
- Type *ChunkType = Type::getIntNTy(Func->getContext(), 8 * ChunkBytes);
- Type *OpaqueType = ArrayType::get(ChunkType, StructBytes / ChunkBytes);
LoadInst *LI =
new LoadInst(OpaqueType, ArgInParam, Arg->getName(),
/*isVolatile=*/false, AllocA->getAlign(), FirstInst);
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 9a306036044be4..17b04b5a601249 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -7,6 +7,7 @@
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
+; IR: [[OPAQUE_OUTER:.*]] = type { <2 x i64>, i64 }
%class.outer = type <{ %class.inner, i32, [4 x i8] }>
%class.inner = type { ptr, ptr }
@@ -14,7 +15,7 @@ target triple = "nvptx64-nvidia-cuda"
; COMMON-LABEL: load_alignment
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
entry:
-; IR: load [3 x i64], ptr addrspace(101)
+; IR: load [[OPAQUE_OUTER]], ptr addrspace(101)
; IR-SAME: align 8
; PTX: ld.param.u64
; PTX-NOT: ld.param.u8
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 5b55e3c5b7280d..7dc5df76bd3329 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -5,6 +5,7 @@ source_filename = "<stdin>"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
+; COMMON: [[OPAQUE_C:.*]] = type { [2 x <2 x i64>], i64, i32, i8 }
%struct.S = type { i32, i32 }
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
@@ -88,8 +89,8 @@ define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out,
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -115,8 +116,8 @@ define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr nound
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
@@ -134,8 +135,8 @@ define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr n
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -155,8 +156,8 @@ define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, pt
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
@@ -174,8 +175,8 @@ define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
@@ -195,8 +196,8 @@ define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
@@ -232,8 +233,8 @@ define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr n
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
@@ -251,8 +252,8 @@ define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr noc
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S5:%.*]] = load [1 x i64], ptr addrspace(101) [[S4]], align 4
-; COMMON-NEXT: store [1 x i64] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[S5:%.*]] = load i64, ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store i64 [[S5]], ptr [[S3]], align 4
; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
@@ -273,12 +274,12 @@ define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2,
; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4
-; SM_60-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4
+; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
+; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4
-; SM_60-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4
@@ -313,12 +314,12 @@ define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %i
; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT25]], align 4
-; COMMON-NEXT: store [1 x i32] [[INPUT26]], ptr [[INPUT24]], align 4
+; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
+; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i32], ptr addrspace(101) [[INPUT12]], align 4
-; COMMON-NEXT: store [1 x i32] [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
; COMMON-NEXT: ret void
@@ -337,12 +338,12 @@ define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S)
; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr
; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8
-; SM_60-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 8
+; SM_60-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8
+; SM_60-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; SM_60-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4
-; SM_60-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_60: [[FIRST]]:
; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
@@ -402,12 +403,12 @@ define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%str
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT26:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT25]], align 8
-; COMMON-NEXT: store [1 x i64] [[INPUT26]], ptr [[INPUT24]], align 8
+; COMMON-NEXT: [[INPUT26:%.*]] = load i64, ptr addrspace(101) [[INPUT25]], align 8
+; COMMON-NEXT: store i64 [[INPUT26]], ptr [[INPUT24]], align 8
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; COMMON-NEXT: [[INPUT13:%.*]] = load [1 x i64], ptr addrspace(101) [[INPUT12]], align 4
-; COMMON-NEXT: store [1 x i64] [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: [[INPUT13:%.*]] = load i64, ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store i64 [[INPUT13]], ptr [[INPUT11]], align 4
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COMMON: [[FIRST]]:
; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
@@ -447,8 +448,27 @@ define dso_local void @padding(ptr nocapture noundef readnone %out, ptr noundef
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[UNION_U]], align 4
; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
-; COMMON-NEXT: [[S3:%.*]] = load [1 x i64], ptr addrspace(101) [[S2]], align 4
-; COMMON-NEXT: store [1 x i64] [[S3]], ptr [[S1]], align 4
+; COMMON-NEXT: [[S3:%.*]] = load i64, ptr addrspace(101) [[S2]], align 4
+; COMMON-NEXT: store i64 [[S3]], ptr [[S1]], align 4
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]])
+; COMMON-NEXT: ret void
+;
+entry:
+ call void @_Z6escapePv(ptr noundef nonnull %s) #0
+ ret void
+}
+
+%struct.C = type { [45 x i8] }
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @coalescing(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.C) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @coalescing(
+; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_C:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_C]], align 4
+; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S3:%.*]] = load [[OPAQUE_C]], ptr addrspace(101) [[S2]], align 4
+; COMMON-NEXT: store [[OPAQUE_C]] [[S3]], ptr [[S1]], align 4
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]])
; COMMON-NEXT: ret void
;
More information about the llvm-commits
mailing list