r259122 - [CUDA] Generate CUDA's printf alloca in its function's entry block.
Justin Lebar via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 28 15:58:29 PST 2016
Author: jlebar
Date: Thu Jan 28 17:58:28 2016
New Revision: 259122
URL: http://llvm.org/viewvc/llvm-project?rev=259122&view=rev
Log:
[CUDA] Generate CUDA's printf alloca in its function's entry block.
Summary:
This is necessary to prevent llvm from generating stacksave intrinsics
around this alloca. NVVM doesn't have a stack, and we don't handle said
intrinsics.
Reviewers: rnk, echristo
Subscribers: cfe-commits, jhen, tra
Differential Revision: http://reviews.llvm.org/D16664
Modified:
cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp
cfe/trunk/test/CodeGenCUDA/printf.cu
Modified: cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp?rev=259122&r1=259121&r2=259122&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp Thu Jan 28 17:58:28 2016
@@ -52,10 +52,13 @@ static llvm::Function *GetVprintfDeclara
//
// is converted into something resembling
//
-// char* buf = alloca(...);
-// *reinterpret_cast<Arg1*>(buf) = arg1;
-// *reinterpret_cast<Arg2*>(buf + ...) = arg2;
-// *reinterpret_cast<Arg3*>(buf + ...) = arg3;
+// struct Tmp {
+// Arg1 a1;
+// Arg2 a2;
+// Arg3 a3;
+// };
+// char* buf = alloca(sizeof(Tmp));
+// *(Tmp*)buf = {a1, a2, a3};
// vprintf("format string", buf);
//
// buf is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of the
@@ -80,48 +83,24 @@ CodeGenFunction::EmitCUDADevicePrintfCal
E->arguments(), E->getDirectCallee(),
/* ParamsToSkip = */ 0);
- // Figure out how large of a buffer we need to hold our varargs and how
- // aligned the buffer needs to be. We start iterating at Arg[1], because
- // that's our first vararg.
- unsigned BufSize = 0;
- unsigned BufAlign = 0;
- for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
- const RValue& RV = Args[I].RV;
- llvm::Type* Ty = RV.getScalarVal()->getType();
-
- auto Align = DL.getPrefTypeAlignment(Ty);
- BufAlign = std::max(BufAlign, Align);
- // Add padding required to keep the current arg aligned.
- BufSize = llvm::alignTo(BufSize, Align);
- BufSize += DL.getTypeAllocSize(Ty);
- }
-
- // Construct and fill the buffer.
- llvm::Value* BufferPtr = nullptr;
- if (BufSize == 0) {
+ // Construct and fill the args buffer that we'll pass to vprintf.
+ llvm::Value *BufferPtr;
+ if (Args.size() <= 1) {
// If there are no args, pass a null pointer to vprintf.
BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
} else {
- BufferPtr = Builder.Insert(new llvm::AllocaInst(
- llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize),
- BufAlign, "printf_arg_buf"));
+ llvm::SmallVector<llvm::Type *, 8> ArgTypes;
+ for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I)
+ ArgTypes.push_back(Args[I].RV.getScalarVal()->getType());
+ llvm::Type *AllocaTy = llvm::StructType::create(ArgTypes, "printf_args");
+ llvm::Value *Alloca = CreateTempAlloca(AllocaTy);
- unsigned Offset = 0;
for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
+ llvm::Value *P = Builder.CreateStructGEP(AllocaTy, Alloca, I - 1);
llvm::Value *Arg = Args[I].RV.getScalarVal();
- llvm::Type *Ty = Arg->getType();
- auto Align = DL.getPrefTypeAlignment(Ty);
-
- // Pad the buffer to Arg's alignment.
- Offset = llvm::alignTo(Offset, Align);
-
- // Store Arg into the buffer at Offset.
- llvm::Value *GEP =
- Builder.CreateGEP(BufferPtr, llvm::ConstantInt::get(Int32Ty, Offset));
- llvm::Value *Cast = Builder.CreateBitCast(GEP, Ty->getPointerTo());
- Builder.CreateAlignedStore(Arg, Cast, Align);
- Offset += DL.getTypeAllocSize(Ty);
+ Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlignment(Arg->getType()));
}
+ BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
}
// Invoke vprintf and return.
Modified: cfe/trunk/test/CodeGenCUDA/printf.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/printf.cu?rev=259122&r1=259121&r2=259122&view=diff
==============================================================================
--- cfe/trunk/test/CodeGenCUDA/printf.cu (original)
+++ cfe/trunk/test/CodeGenCUDA/printf.cu Thu Jan 28 17:58:28 2016
@@ -9,45 +9,35 @@
extern "C" __device__ int vprintf(const char*, const char*);
// Check a simple call to printf end-to-end.
+// CHECK: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double }
__device__ int CheckSimple() {
+ // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]]
// CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
- const char* fmt = "%d";
- // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca i8, i32 4, align 4
- // CHECK: [[PTR:%[0-9]+]] = getelementptr i8, i8* [[BUF]], i32 0
- // CHECK: [[CAST:%[0-9]+]] = bitcast i8* [[PTR]] to i32*
- // CHECK: store i32 42, i32* [[CAST]], align 4
- // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF]])
+ const char* fmt = "%d %lld %f";
+ // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0
+ // CHECK: store i32 1, i32* [[PTR0]], align 4
+ // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1
+ // CHECK: store i64 2, i64* [[PTR1]], align 8
+ // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2
+ // CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8
+ // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8*
+ // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]])
// CHECK: ret i32 [[RET]]
- return printf(fmt, 42);
-}
-
-// Check that the args' types are promoted correctly when we call printf.
-__device__ void CheckTypes() {
- // CHECK: alloca {{.*}} align 8
- // CHECK: getelementptr {{.*}} i32 0
- // CHECK: bitcast {{.*}} to i32*
- // CHECK: getelementptr {{.*}} i32 4
- // CHECK: bitcast {{.*}} to i32*
- // CHECK: getelementptr {{.*}} i32 8
- // CHECK: bitcast {{.*}} to double*
- // CHECK: getelementptr {{.*}} i32 16
- // CHECK: bitcast {{.*}} to double*
- printf("%d %d %f %f", (char)1, (short)2, 3.0f, 4.0);
-}
-
-// Check that the args are aligned properly in the buffer.
-__device__ void CheckAlign() {
- // CHECK: alloca i8, i32 40, align 8
- // CHECK: getelementptr {{.*}} i32 0
- // CHECK: getelementptr {{.*}} i32 8
- // CHECK: getelementptr {{.*}} i32 16
- // CHECK: getelementptr {{.*}} i32 20
- // CHECK: getelementptr {{.*}} i32 24
- // CHECK: getelementptr {{.*}} i32 32
- printf("%d %f %d %d %d %lld", 1, 2.0, 3, 4, 5, (long long)6);
+ return printf(fmt, 1, 2ll, 3.0);
}
__device__ void CheckNoArgs() {
// CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
printf("hello, world!");
}
+
+// Check that printf's alloca happens in the entry block, not inside the if
+// statement.
+__device__ bool foo();
+__device__ void CheckAllocaIsInEntryBlock() {
+ // CHECK: alloca %printf_args
+ // CHECK: call {{.*}} @_Z3foov()
+ if (foo()) {
+ printf("%d", 42);
+ }
+}
More information about the cfe-commits
mailing list