[clang] [llvm] [WIP][AMDGPU] Enable hostcall printf for OpenCL (PR #70932)
Vikram Hegde via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 1 23:56:16 PDT 2023
https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/70932
>From 4c0467078b2f38e814569ad351f86129d1c1d5ee Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Wed, 4 Oct 2023 05:41:47 -0400
Subject: [PATCH] [WIP][AMDGPU] hostcall printf support for OpenCL
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 8 +
clang/include/clang/Basic/TargetOptions.h | 10 +-
clang/include/clang/Driver/Options.td | 8 +-
clang/lib/AST/Decl.cpp | 7 +
clang/lib/Basic/Targets/AMDGPU.cpp | 2 +
clang/lib/CodeGen/CGBuiltin.cpp | 8 +-
clang/lib/CodeGen/CGGPUBuiltin.cpp | 27 ++-
clang/lib/CodeGen/CodeGenModule.cpp | 4 +-
clang/lib/Driver/ToolChains/Clang.cpp | 10 +
.../CodeGenHIP/printf-kind-module-flag.hip | 6 +-
clang/test/CodeGenOpenCL/amdgpu-printf.cl | 205 +++++++++++++++++-
.../lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 135 +++++++-----
12 files changed, 359 insertions(+), 71 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 532a91fd903e87c..b5e8be145b03a0d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -21,6 +21,10 @@
#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
#endif
+
+#if defined(BUILTIN) && !defined(LANGBUILTIN)
+#define LANGBUILTIN(ID, TYPE, ATTRS, BUILTIN_LANG) BUILTIN(ID, TYPE, ATTRS)
+#endif
//===----------------------------------------------------------------------===//
// SI+ only builtins.
//===----------------------------------------------------------------------===//
@@ -402,5 +406,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+// OpenCL
+LANGBUILTIN(printf, "icC*4.", "fp:0:", ALL_OCL_LANGUAGES)
+
#undef BUILTIN
#undef TARGET_BUILTIN
+#undef LANGBUILTIN
diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
index 8bb03249b7f8308..8ff07783b0dd5d1 100644
--- a/clang/include/clang/Basic/TargetOptions.h
+++ b/clang/include/clang/Basic/TargetOptions.h
@@ -92,16 +92,20 @@ class TargetOptions {
/// \brief Enumeration values for AMDGPU printf lowering scheme
enum class AMDGPUPrintfKind {
+ /// Use deafult lowering scheme, HIP programs use hostcall and OpenCL uses
+ /// buffered by default,
+ None = 0,
+
/// printf lowering scheme involving hostcalls, currently used by HIP
/// programs by default
- Hostcall = 0,
+ Hostcall = 1,
/// printf lowering scheme involving implicit printf buffers,
- Buffered = 1,
+ Buffered = 2,
};
/// \brief AMDGPU Printf lowering scheme
- AMDGPUPrintfKind AMDGPUPrintfKindVal = AMDGPUPrintfKind::Hostcall;
+ AMDGPUPrintfKind AMDGPUPrintfKindVal = AMDGPUPrintfKind::None;
// The code model to be used as specified by the user. Corresponds to
// CodeModel::Model enum defined in include/llvm/Support/CodeGen.h, plus
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index b1229b2f4562379..d62cfe8961db90a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1251,15 +1251,17 @@ def emit_static_lib : Flag<["--"], "emit-static-lib">,
def mprintf_kind_EQ : Joined<["-"], "mprintf-kind=">, Group<m_Group>,
HelpText<"Specify the printf lowering scheme (AMDGPU only), allowed values are "
+ "\"none\" (Use default lowering scheme for a language, HIP uses hostcalls and "
+ "OpenCL uses Buffered scheme), "
"\"hostcall\"(printing happens during kernel execution, this scheme "
"relies on hostcalls which require system to support pcie atomics) "
"and \"buffered\"(printing happens after all kernel threads exit, "
"this uses a printf buffer and does not rely on pcie atomic support)">,
Visibility<[ClangOption, CC1Option]>,
- Values<"hostcall,buffered">,
+ Values<"none,hostcall,buffered">,
NormalizedValuesScope<"TargetOptions::AMDGPUPrintfKind">,
- NormalizedValues<["Hostcall", "Buffered"]>,
- MarshallingInfoEnum<TargetOpts<"AMDGPUPrintfKindVal">, "Hostcall">;
+ NormalizedValues<["None", "Hostcall", "Buffered"]>,
+ MarshallingInfoEnum<TargetOpts<"AMDGPUPrintfKindVal">, "None">;
// HIP options
let Group = hip_Group in {
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 6efc177d61c03ba..b99376e42b8e7ba 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -49,6 +49,7 @@
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/Specifiers.h"
+#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetCXXABI.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/Visibility.h"
@@ -3585,6 +3586,12 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const {
if (!ConsiderWrapperFunctions && getStorageClass() == SC_Static)
return 0;
+ // AMDGCN implementation supports printf as a special case even
+ // for OpenCL
+ if (Context.getTargetInfo().getTriple().isAMDGCN() &&
+ Context.getLangOpts().OpenCL && BuiltinID == AMDGPU::BIprintf)
+ return BuiltinID;
+
// OpenCL v1.2 s6.9.f - The library functions defined in
// the C99 standard headers are not available.
if (Context.getLangOpts().OpenCL &&
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 409ae32ab424215..307cfa49f54e926 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -91,6 +91,8 @@ static constexpr Builtin::Info BuiltinInfo[] = {
{#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
{#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
+#define LANGBUILTIN(ID, TYPE, ATTRS, LANG) \
+ {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, LANG},
#include "clang/Basic/BuiltinsAMDGPU.def"
};
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e047d31c012116f..803692bf5842ccd 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2370,6 +2370,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
&getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad())
BuiltinID = mutateLongDoubleBuiltin(BuiltinID);
+ // Mutate the pritnf builtin ID since we use the same CodeGen path for
+ // HIP and OpenCL
+ if (getTarget().getTriple().isAMDGCN() && BuiltinID == AMDGPU::BIprintf)
+ BuiltinID = Builtin::BIprintf;
+
// If the builtin has been declared explicitly with an assembler label,
// disable the specialized emitting below. Ideally we should communicate the
// rename in IR, or at least avoid generating the intrinsic calls that are
@@ -5529,7 +5534,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
return EmitOpenMPDevicePrintfCallExpr(E);
if (getTarget().getTriple().isNVPTX())
return EmitNVPTXDevicePrintfCallExpr(E);
- if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
+ if (getTarget().getTriple().isAMDGCN() &&
+ (getLangOpts().HIP || getLangOpts().OpenCL))
return EmitAMDGPUDevicePrintfCallExpr(E);
}
diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp
index 75fb06de938425d..04d7d063df7ac9d 100644
--- a/clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -13,6 +13,7 @@
#include "CodeGenFunction.h"
#include "clang/Basic/Builtins.h"
+#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/MathExtras.h"
@@ -176,10 +177,20 @@ RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) {
E, this, GetVprintfDeclaration(CGM.getModule()), false);
}
+// Deterimines if an argument is a string
+static bool isString(const clang::Type *argXTy) {
+ if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+ argXTy->getPointeeOrArrayElementType()->isCharType())
+ return true;
+ else
+ return false;
+}
+
RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn);
assert(E->getBuiltinCallee() == Builtin::BIprintf ||
- E->getBuiltinCallee() == Builtin::BI__builtin_printf);
+ E->getBuiltinCallee() == Builtin::BI__builtin_printf ||
+ E->getBuiltinCallee() == AMDGPU::BIprintf);
assert(E->getNumArgs() >= 1); // printf always has at least one arg.
CallArgList CallArgs;
@@ -187,6 +198,8 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
E->arguments(), E->getDirectCallee(),
/* ParamsToSkip = */ 0);
+ llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint());
+ IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation());
SmallVector<llvm::Value *, 8> Args;
for (const auto &A : CallArgs) {
@@ -197,14 +210,16 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
}
llvm::Value *Arg = A.getRValue(*this).getScalarVal();
+ if (isString(A.getType().getTypePtr()) && CGM.getLangOpts().OpenCL)
+ Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy);
Args.push_back(Arg);
}
- llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint());
- IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation());
-
- bool isBuffered = (CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal ==
- clang::TargetOptions::AMDGPUPrintfKind::Buffered);
+ auto PFK = CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal;
+ bool isBuffered =
+ ((PFK == clang::TargetOptions::AMDGPUPrintfKind::Buffered) ||
+ (CGM.getLangOpts().OpenCL &&
+ (PFK == clang::TargetOptions::AMDGPUPrintfKind::None)));
auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args, isBuffered);
Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
return RValue::get(Printf);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index cc81a68b15c4324..9b6a276260823b8 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -855,8 +855,8 @@ void CodeGenModule::Release() {
// Currently, "-mprintf-kind" option is only supported for HIP
if (LangOpts.HIP) {
auto *MDStr = llvm::MDString::get(
- getLLVMContext(), (getTarget().getTargetOpts().AMDGPUPrintfKindVal ==
- TargetOptions::AMDGPUPrintfKind::Hostcall)
+ getLLVMContext(), (getTarget().getTargetOpts().AMDGPUPrintfKindVal !=
+ TargetOptions::AMDGPUPrintfKind::Buffered)
? "hostcall"
: "buffered");
getModule().addModuleFlag(llvm::Module::Error, "amdgpu_printf_kind",
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 79f7fba22570746..c2aa1b1db652188 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4742,6 +4742,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.ClaimAllArgs(options::OPT_gen_cdb_fragment_path);
}
+ if (TC.getTriple().isAMDGPU() && types::isOpenCL(Input.getType())) {
+ if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) {
+ CmdArgs.push_back(Args.MakeArgString(
+ "-mprintf-kind=" +
+ Args.getLastArgValue(options::OPT_mprintf_kind_EQ)));
+ // Force compiler error on invalid conversion specifiers
+ CmdArgs.push_back(Args.MakeArgString("-Werror=format-invalid-specifier"));
+ }
+ }
+
if (IsCuda || IsHIP) {
// We have to pass the triple of the host if compiling for a CUDA/HIP device
// and vice-versa.
diff --git a/clang/test/CodeGenHIP/printf-kind-module-flag.hip b/clang/test/CodeGenHIP/printf-kind-module-flag.hip
index a47262416c2b800..3a20f043990689d 100644
--- a/clang/test/CodeGenHIP/printf-kind-module-flag.hip
+++ b/clang/test/CodeGenHIP/printf-kind-module-flag.hip
@@ -9,9 +9,9 @@
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -mprintf-kind=buffered -o - %s | FileCheck -check-prefix=BUFFERED %s
-// RUN: not %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
-// RUN: -mprintf-kind=none -o - %s 2>&1| FileCheck %s -check-prefix=INV
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN: -mprintf-kind=none -o - %s 2>&1| FileCheck %s -check-prefix=NONE
// HOSTCALL: !{{.*}} = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
// BUFFERED: !{{.*}} = !{i32 1, !"amdgpu_printf_kind", !"buffered"}
-// INV: error: invalid value 'none' in '-mprintf-kind=none'
+// NONE: !{{.*}} = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
index edf6dbf8657cbe5..c6cab062a45618e 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
@@ -1,5 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -mprintf-kind=buffered -disable-llvm-passes -emit-llvm -o - %s | FileCheck --check-prefix=CHECK_BUFFERED %s
+// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -mprintf-kind=hostcall -disable-llvm-passes -emit-llvm -o - %s | FileCheck --check-prefix=CHECK_HOSTCALL %s
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
@@ -7,6 +8,61 @@ int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)))
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: ret void
+// CHECK_BUFFERED-LABEL: @test_printf_noargs(
+// CHECK_BUFFERED-NEXT: entry:
+// CHECK_BUFFERED-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_BUFFERED: strlen.while:
+// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str to ptr), [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_BUFFERED-NEXT: [[TMP1]] = getelementptr i8, ptr [[TMP0]], i64 1
+// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0
+// CHECK_BUFFERED-NEXT: br i1 [[TMP3]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_BUFFERED: strlen.while.done:
+// CHECK_BUFFERED-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP0]] to i64
+// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str to ptr) to i64)
+// CHECK_BUFFERED-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1
+// CHECK_BUFFERED-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_BUFFERED: strlen.join:
+// CHECK_BUFFERED-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP6]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_BUFFERED-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 7
+// CHECK_BUFFERED-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 4294967288
+// CHECK_BUFFERED-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4
+// CHECK_BUFFERED-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
+// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP11]])
+// CHECK_BUFFERED-NEXT: [[TMP12:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP12]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK_BUFFERED: end.block:
+// CHECK_BUFFERED-NEXT: [[TMP13:%.*]] = xor i1 [[TMP12]], true
+// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP13]] to i32
+// CHECK_BUFFERED-NEXT: ret void
+// CHECK_BUFFERED: argpush.block:
+// CHECK_BUFFERED-NEXT: [[TMP14:%.*]] = shl i32 [[TMP11]], 2
+// CHECK_BUFFERED-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK_BUFFERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP15]], ptr align 1 addrspacecast (ptr addrspace(4) @.str to ptr), i64 [[TMP7]], i1 false)
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP15]], i64 [[TMP9]]
+// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]]
+//
+// CHECK_HOSTCALL-LABEL: @test_printf_noargs(
+// CHECK_HOSTCALL-NEXT: entry:
+// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_HOSTCALL: strlen.while:
+// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str to ptr), [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP2]] = getelementptr i8, ptr [[TMP1]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP1]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP4:%.*]] = icmp eq i8 [[TMP3]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP4]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_HOSTCALL: strlen.while.done:
+// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP1]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str to ptr) to i64)
+// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_HOSTCALL: strlen.join:
+// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = phi i64 [ [[TMP7]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP0]], ptr addrspacecast (ptr addrspace(4) @.str to ptr), i64 [[TMP8]], i32 1)
+// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+// CHECK_HOSTCALL-NEXT: ret void
//
__kernel void test_printf_noargs() {
printf("");
@@ -19,6 +75,53 @@ __kernel void test_printf_noargs() {
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR4]]
// CHECK-NEXT: ret void
+// CHECK_BUFFERED-LABEL: @test_printf_int(
+// CHECK_BUFFERED-NEXT: entry:
+// CHECK_BUFFERED-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_BUFFERED-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12:![0-9]+]]
+// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]]
+// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 20)
+// CHECK_BUFFERED-NEXT: [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP1]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK_BUFFERED: end.block:
+// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = xor i1 [[TMP1]], true
+// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP2]] to i32
+// CHECK_BUFFERED-NEXT: ret void
+// CHECK_BUFFERED: argpush.block:
+// CHECK_BUFFERED-NEXT: store i32 82, ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK_BUFFERED-NEXT: store i64 -2582314622382785113, ptr addrspace(1) [[TMP3]], align 8
+// CHECK_BUFFERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP3]], i32 8
+// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_BUFFERED-NEXT: store i64 [[TMP5]], ptr addrspace(1) [[TMP4]], align 8
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 8
+// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]]
+//
+// CHECK_HOSTCALL-LABEL: @test_printf_int(
+// CHECK_HOSTCALL-NEXT: entry:
+// CHECK_HOSTCALL-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_HOSTCALL-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9:![0-9]+]]
+// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]]
+// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_HOSTCALL: strlen.while:
+// CHECK_HOSTCALL-NEXT: [[TMP2:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.1 to ptr), [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP3]] = getelementptr i8, ptr [[TMP2]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP5]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_HOSTCALL: strlen.while.done:
+// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP2]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr) to i64)
+// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_HOSTCALL: strlen.join:
+// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP1]], ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), i64 [[TMP9]], i32 0)
+// CHECK_HOSTCALL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP12:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP10]], i32 1, i64 [[TMP11]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1)
+// CHECK_HOSTCALL-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
+// CHECK_HOSTCALL-NEXT: ret void
//
__kernel void test_printf_int(int i) {
printf("%d", i);
@@ -36,6 +139,106 @@ __kernel void test_printf_int(int i) {
// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5]]
// CHECK-NEXT: ret void
+// CHECK_BUFFERED-LABEL: @test_printf_str_int(
+// CHECK_BUFFERED-NEXT: entry:
+// CHECK_BUFFERED-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_BUFFERED-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5)
+// CHECK_BUFFERED-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]]
+// CHECK_BUFFERED-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR1:[0-9]+]]
+// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false)
+// CHECK_BUFFERED-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0
+// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]]
+// CHECK_BUFFERED-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr
+// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP2]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_BUFFERED: strlen.while:
+// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_BUFFERED-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1
+// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
+// CHECK_BUFFERED-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0
+// CHECK_BUFFERED-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_BUFFERED: strlen.while.done:
+// CHECK_BUFFERED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP1]] to i64
+// CHECK_BUFFERED-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP3]] to i64
+// CHECK_BUFFERED-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[TMP7]]
+// CHECK_BUFFERED-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1
+// CHECK_BUFFERED-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_BUFFERED: strlen.join:
+// CHECK_BUFFERED-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_BUFFERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 7
+// CHECK_BUFFERED-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4294967288
+// CHECK_BUFFERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 20
+// CHECK_BUFFERED-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP15]])
+// CHECK_BUFFERED-NEXT: [[TMP16:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP16]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK_BUFFERED: end.block:
+// CHECK_BUFFERED-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true
+// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP17]] to i32
+// CHECK_BUFFERED-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR1]]
+// CHECK_BUFFERED-NEXT: ret void
+// CHECK_BUFFERED: argpush.block:
+// CHECK_BUFFERED-NEXT: [[TMP18:%.*]] = shl i32 [[TMP15]], 2
+// CHECK_BUFFERED-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], 2
+// CHECK_BUFFERED-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK_BUFFERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK_BUFFERED-NEXT: store i64 -2942283388077972797, ptr addrspace(1) [[TMP20]], align 8
+// CHECK_BUFFERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 8
+// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP21]], ptr align 1 [[TMP1]], i64 [[TMP11]], i1 false)
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i64 [[TMP13]]
+// CHECK_BUFFERED-NEXT: [[TMP22:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_BUFFERED-NEXT: store i64 [[TMP22]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8
+// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]]
+//
+// CHECK_HOSTCALL-LABEL: @test_printf_str_int(
+// CHECK_HOSTCALL-NEXT: entry:
+// CHECK_HOSTCALL-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_HOSTCALL-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5)
+// CHECK_HOSTCALL-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]]
+// CHECK_HOSTCALL-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR3:[0-9]+]]
+// CHECK_HOSTCALL-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false)
+// CHECK_HOSTCALL-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0
+// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]]
+// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr
+// CHECK_HOSTCALL-NEXT: [[TMP2:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_HOSTCALL: strlen.while:
+// CHECK_HOSTCALL-NEXT: [[TMP3:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.2 to ptr), [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_HOSTCALL: strlen.while.done:
+// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP3]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr) to i64)
+// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_HOSTCALL: strlen.join:
+// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP11:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP2]], ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), i64 [[TMP10]], i32 0)
+// CHECK_HOSTCALL-NEXT: [[TMP12:%.*]] = icmp eq ptr [[TMP1]], null
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP12]], label [[STRLEN_JOIN1:%.*]], label [[STRLEN_WHILE2:%.*]]
+// CHECK_HOSTCALL: strlen.while2:
+// CHECK_HOSTCALL-NEXT: [[TMP13:%.*]] = phi ptr [ [[TMP1]], [[STRLEN_JOIN]] ], [ [[TMP14:%.*]], [[STRLEN_WHILE2]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP14]] = getelementptr i8, ptr [[TMP13]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP13]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP16:%.*]] = icmp eq i8 [[TMP15]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP16]], label [[STRLEN_WHILE_DONE3:%.*]], label [[STRLEN_WHILE2]]
+// CHECK_HOSTCALL: strlen.while.done3:
+// CHECK_HOSTCALL-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP1]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP19:%.*]] = sub i64 [[TMP18]], [[TMP17]]
+// CHECK_HOSTCALL-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN1]]
+// CHECK_HOSTCALL: strlen.join1:
+// CHECK_HOSTCALL-NEXT: [[TMP21:%.*]] = phi i64 [ [[TMP20]], [[STRLEN_WHILE_DONE3]] ], [ 0, [[STRLEN_JOIN]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP22:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP11]], ptr [[TMP1]], i64 [[TMP21]], i32 0)
+// CHECK_HOSTCALL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP24:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP22]], i32 1, i64 [[TMP23]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1)
+// CHECK_HOSTCALL-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
+// CHECK_HOSTCALL-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR3]]
+// CHECK_HOSTCALL-NEXT: ret void
//
__kernel void test_printf_str_int(int i) {
char s[] = "foo";
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index 2195406c144c8ba..f48721b6acd0063 100644
--- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -26,28 +26,31 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-emit-printf"
-static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
+static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg,
+ bool IsBuffered) {
+ const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
auto Int64Ty = Builder.getInt64Ty();
auto Ty = Arg->getType();
if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
- switch (IntTy->getBitWidth()) {
- case 32:
- return Builder.CreateZExt(Arg, Int64Ty);
- case 64:
- return Arg;
+ if (IntTy->getBitWidth() < 64) {
+ return Builder.CreateZExt(Arg, Builder.getInt64Ty());
}
}
- if (Ty->getTypeID() == Type::DoubleTyID) {
+ if (Ty->isFloatingPointTy()) {
+ if (DL.getTypeAllocSize(Ty) < 8)
+ Arg = Builder.CreateFPExt(Arg, Builder.getDoubleTy());
+ if (IsBuffered)
+ return Arg;
return Builder.CreateBitCast(Arg, Int64Ty);
}
- if (isa<PointerType>(Ty)) {
+ if (!IsBuffered && isa<PointerType>(Ty)) {
return Builder.CreatePtrToInt(Arg, Int64Ty);
}
- llvm_unreachable("unexpected type");
+ return Arg;
}
static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
@@ -74,8 +77,8 @@ static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
}
static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool IsLast) {
- auto Arg0 = fitArgInto64Bits(Builder, Arg);
+ bool IsLast, bool IsBuffered) {
+ auto Arg0 = fitArgInto64Bits(Builder, Arg, IsBuffered);
auto Zero = Builder.getInt64(0);
return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
Zero, IsLast);
@@ -170,20 +173,46 @@ static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
}
+static Value *appendVectorArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast, bool IsBuffered) {
+ assert(Arg->getType()->isVectorTy() && "incorrent append* function");
+ auto VectorTy = dyn_cast<FixedVectorType>(Arg->getType());
+ auto Zero = Builder.getInt64(0);
+ if (VectorTy) {
+ for (unsigned int i = 0; i < VectorTy->getNumElements() - 1; i++) {
+ auto Val = Builder.CreateExtractElement(Arg, i);
+ Desc = callAppendArgs(Builder, Desc, 1,
+ fitArgInto64Bits(Builder, Val, IsBuffered), Zero,
+ Zero, Zero, Zero, Zero, Zero, false);
+ }
+
+ auto Val =
+ Builder.CreateExtractElement(Arg, VectorTy->getNumElements() - 1);
+ return callAppendArgs(Builder, Desc, 1,
+ fitArgInto64Bits(Builder, Val, IsBuffered), Zero,
+ Zero, Zero, Zero, Zero, Zero, IsLast);
+ }
+ return nullptr;
+}
+
static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool SpecIsCString, bool IsLast) {
+ bool SpecIsCString, bool IsVector, bool IsLast,
+ bool IsBuffered) {
if (SpecIsCString && isa<PointerType>(Arg->getType())) {
return appendString(Builder, Desc, Arg, IsLast);
- }
- // If the format specifies a string but the argument is not, the frontend will
- // have printed a warning. We just rely on undefined behaviour and send the
- // argument anyway.
- return appendArg(Builder, Desc, Arg, IsLast);
+ } else if (IsVector) {
+ return appendVectorArg(Builder, Desc, Arg, IsLast, IsBuffered);
+ } else
+ // If the format specifies a string but the argument is not, the frontend
+ // will have printed a warning. We just rely on undefined behaviour and send
+ // the argument anyway.
+ return appendArg(Builder, Desc, Arg, IsLast, IsBuffered);
}
// Scan the format string to locate all specifiers, and mark the ones that
// specify a string, i.e, the "%s" specifier with optional '*' characters.
-static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) {
+static void locateCStringsAndVectors(SparseBitVector<8> &BV,
+ SparseBitVector<8> &OV, StringRef Str) {
static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
size_t SpecPos = 0;
// Skip the first argument, the format string.
@@ -194,6 +223,8 @@ static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) {
SpecPos += 2;
continue;
}
+ if (Str.find_first_of("v", SpecPos) != StringRef::npos)
+ OV.set(ArgIdx);
auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
if (SpecEnd == StringRef::npos)
return;
@@ -224,7 +255,8 @@ struct StringData {
static Value *callBufferedPrintfStart(
IRBuilder<> &Builder, ArrayRef<Value *> Args, Value *Fmt,
bool isConstFmtStr, SparseBitVector<8> &SpecIsCString,
- SmallVectorImpl<StringData> &StringContents, Value *&ArgSize) {
+ SparseBitVector<8> &OCLVectors, SmallVectorImpl<StringData> &StringContents,
+ Value *&ArgSize) {
Module *M = Builder.GetInsertBlock()->getModule();
Value *NonConstStrLen = nullptr;
Value *LenWithNull = nullptr;
@@ -278,7 +310,13 @@ static Value *callBufferedPrintfStart(
StringData(StringRef(), LenWithNull, LenWithNullAligned, false));
}
} else {
- int AllocSize = M->getDataLayout().getTypeAllocSize(Args[i]->getType());
+ int AllocSize = 0;
+ if (OCLVectors.test(i)) {
+ auto VecArg = dyn_cast<FixedVectorType>(Args[i]->getType());
+ assert(VecArg && "invalid vector specifier");
+ AllocSize = VecArg->getNumElements() * 8;
+ } else
+ AllocSize = M->getDataLayout().getTypeAllocSize(Args[i]->getType());
// We end up expanding non string arguments to 8 bytes
// (args smaller than 8 bytes)
BufSize += std::max(AllocSize, 8);
@@ -352,30 +390,10 @@ static void processConstantStringArg(StringData *SD, IRBuilder<> &Builder,
WhatToStore.push_back(ConstantInt::get(Builder.getInt32Ty(), 0));
}
-static Value *processNonStringArg(Value *Arg, IRBuilder<> &Builder) {
- const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
- auto Ty = Arg->getType();
-
- if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
- if (IntTy->getBitWidth() < 64) {
- return Builder.CreateZExt(Arg, Builder.getInt64Ty());
- }
- }
-
- if (Ty->isFloatingPointTy()) {
- if (DL.getTypeAllocSize(Ty) < 8) {
- return Builder.CreateFPExt(Arg, Builder.getDoubleTy());
- }
- }
-
- return Arg;
-}
-
-static void
-callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef<Value *> Args,
- Value *PtrToStore, SparseBitVector<8> &SpecIsCString,
- SmallVectorImpl<StringData> &StringContents,
- bool IsConstFmtStr) {
+static void callBufferedPrintfArgPush(
+ IRBuilder<> &Builder, ArrayRef<Value *> Args, Value *PtrToStore,
+ SparseBitVector<8> &SpecIsCString, SparseBitVector<8> &OCLVectors,
+ SmallVectorImpl<StringData> &StringContents, bool IsConstFmtStr) {
Module *M = Builder.GetInsertBlock()->getModule();
const DataLayout &DL = M->getDataLayout();
auto StrIt = StringContents.begin();
@@ -407,7 +425,17 @@ callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef<Value *> Args,
continue;
}
} else {
- WhatToStore.push_back(processNonStringArg(Args[i], Builder));
+ if (OCLVectors.test(i)) {
+ auto VectorTy = dyn_cast<FixedVectorType>(Args[i]->getType());
+ auto VecArg = Args[i];
+ for (unsigned int Num = 0; Num < VectorTy->getNumElements(); Num++) {
+ auto Val = Builder.CreateExtractElement(VecArg, Num);
+ WhatToStore.push_back(
+ fitArgInto64Bits(Builder, Val, /*IsBuffered*/ true));
+ }
+ } else
+ WhatToStore.push_back(
+ fitArgInto64Bits(Builder, Args[i], /*IsBuffered*/ true));
}
for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
@@ -434,10 +462,11 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
auto Fmt = Args[0];
SparseBitVector<8> SpecIsCString;
+ SparseBitVector<8> OCLVectors;
StringRef FmtStr;
if (getConstantStringInfo(Fmt, FmtStr))
- locateCStrings(SpecIsCString, FmtStr);
+ locateCStringsAndVectors(SpecIsCString, OCLVectors, FmtStr);
if (IsBuffered) {
SmallVector<StringData, 8> StringContents;
@@ -448,9 +477,9 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
bool IsConstFmtStr = !FmtStr.empty();
Value *ArgSize = nullptr;
- Value *Ptr =
- callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr,
- SpecIsCString, StringContents, ArgSize);
+ Value *Ptr = callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr,
+ SpecIsCString, OCLVectors,
+ StringContents, ArgSize);
// The buffered version still follows OpenCL printf standards for
// printf return value, i.e 0 on success, -1 on failure.
@@ -513,8 +542,8 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
}
// Push The printf arguments onto buffer
- callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, StringContents,
- IsConstFmtStr);
+ callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, OCLVectors,
+ StringContents, IsConstFmtStr);
// End block, returns -1 on failure
BranchInst::Create(End, ArgPush);
@@ -531,7 +560,9 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
for (unsigned int i = 1; i != NumOps; ++i) {
bool IsLast = i == NumOps - 1;
bool IsCString = SpecIsCString.test(i);
- Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
+ bool IsVector = OCLVectors.test(i);
+ Desc = processArg(Builder, Desc, Args[i], IsCString, IsVector, IsLast,
+ IsBuffered);
}
return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
More information about the llvm-commits
mailing list