[clang] [llvm] Enable OpenCL hostcall printf (WIP) (PR #72556)
Vikram Hegde via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 19 21:30:26 PST 2023
https://github.com/vikramRH updated https://github.com/llvm/llvm-project/pull/72556
>From 6ace9d0a51064be189093ca3bb42416aafadb7f6 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Fri, 10 Nov 2023 09:39:41 +0000
Subject: [PATCH 1/3] [AMDGPU] Treat printf as builtin for OpenCL
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 8 ++++++++
clang/lib/AST/Decl.cpp | 7 +++++++
clang/lib/Basic/Targets/AMDGPU.cpp | 2 ++
clang/lib/CodeGen/CGBuiltin.cpp | 5 +++++
4 files changed, 22 insertions(+)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index a19c8bd5f219ec6..1799c72806bfdd4 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -21,6 +21,10 @@
#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
#endif
+
+#if defined(BUILTIN) && !defined(LANGBUILTIN)
+#define LANGBUILTIN(ID, TYPE, ATTRS, BUILTIN_LANG) BUILTIN(ID, TYPE, ATTRS)
+#endif
//===----------------------------------------------------------------------===//
// SI+ only builtins.
//===----------------------------------------------------------------------===//
@@ -406,5 +410,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+// OpenCL
+LANGBUILTIN(printf, "icC*4.", "fp:0:", ALL_OCL_LANGUAGES)
+
#undef BUILTIN
#undef TARGET_BUILTIN
+#undef LANGBUILTIN
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index c5c2edf1bfe3aba..2597422bdd521a0 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -49,6 +49,7 @@
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/Specifiers.h"
+#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetCXXABI.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/Visibility.h"
@@ -3598,6 +3599,12 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const {
if (!ConsiderWrapperFunctions && getStorageClass() == SC_Static)
return 0;
+ // AMDGCN implementation supports printf as a builtin
+ // for OpenCL
+ if (Context.getTargetInfo().getTriple().isAMDGCN() &&
+ Context.getLangOpts().OpenCL && BuiltinID == AMDGPU::BIprintf)
+ return BuiltinID;
+
// OpenCL v1.2 s6.9.f - The library functions defined in
// the C99 standard headers are not available.
if (Context.getLangOpts().OpenCL &&
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 409ae32ab424215..307cfa49f54e926 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -91,6 +91,8 @@ static constexpr Builtin::Info BuiltinInfo[] = {
{#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
{#ID, TYPE, ATTRS, FEATURE, HeaderDesc::NO_HEADER, ALL_LANGUAGES},
+#define LANGBUILTIN(ID, TYPE, ATTRS, LANG) \
+ {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, LANG},
#include "clang/Basic/BuiltinsAMDGPU.def"
};
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 09309a3937fb613..987909b5a62e11b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2458,6 +2458,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
&getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad())
BuiltinID = mutateLongDoubleBuiltin(BuiltinID);
+ // Mutate the printf builtin ID so that we use the same CodeGen path for
+ // HIP and OpenCL with AMDGPU targets.
+ if (getTarget().getTriple().isAMDGCN() && BuiltinID == AMDGPU::BIprintf)
+ BuiltinID = Builtin::BIprintf;
+
// If the builtin has been declared explicitly with an assembler label,
// disable the specialized emitting below. Ideally we should communicate the
// rename in IR, or at least avoid generating the intrinsic calls that are
>From 040a28deef5fe7a5d9e357a898b50335992e708d Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 20 Nov 2023 05:26:27 +0000
Subject: [PATCH 2/3] [AMDGPU] Enable OpenCL printf expansion at clang CodeGen
---
clang/lib/CodeGen/CGBuiltin.cpp | 3 ++-
clang/lib/CodeGen/CGGPUBuiltin.cpp | 25 +++++++++++++++++++------
clang/lib/Driver/ToolChains/Clang.cpp | 10 ++++++++++
3 files changed, 31 insertions(+), 7 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 987909b5a62e11b..8d51df24c7872b7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5622,7 +5622,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
return EmitOpenMPDevicePrintfCallExpr(E);
if (getTarget().getTriple().isNVPTX())
return EmitNVPTXDevicePrintfCallExpr(E);
- if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
+ if (getTarget().getTriple().isAMDGCN() &&
+ (getLangOpts().HIP || getLangOpts().OpenCL))
return EmitAMDGPUDevicePrintfCallExpr(E);
}
diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp
index de4ee68c0da1e79..81e23bc325339bb 100644
--- a/clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -13,6 +13,7 @@
#include "CodeGenFunction.h"
#include "clang/Basic/Builtins.h"
+#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/MathExtras.h"
@@ -177,10 +178,20 @@ RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) {
E, this, GetVprintfDeclaration(CGM.getModule()), false);
}
+// Deterimines if an argument is a string
+static bool isString(const clang::Type *argXTy) {
+ if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+ argXTy->getPointeeOrArrayElementType()->isCharType())
+ return true;
+ else
+ return false;
+}
+
RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn);
assert(E->getBuiltinCallee() == Builtin::BIprintf ||
- E->getBuiltinCallee() == Builtin::BI__builtin_printf);
+ E->getBuiltinCallee() == Builtin::BI__builtin_printf ||
+ E->getBuiltinCallee() == AMDGPU::BIprintf);
assert(E->getNumArgs() >= 1); // printf always has at least one arg.
CallArgList CallArgs;
@@ -188,6 +199,8 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
E->arguments(), E->getDirectCallee(),
/* ParamsToSkip = */ 0);
+ llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint());
+ IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation());
SmallVector<llvm::Value *, 8> Args;
for (const auto &A : CallArgs) {
@@ -198,14 +211,14 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
}
llvm::Value *Arg = A.getRValue(*this).getScalarVal();
+ if (isString(A.getType().getTypePtr()) && CGM.getLangOpts().OpenCL)
+ Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy);
Args.push_back(Arg);
}
- llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint());
- IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation());
-
- bool isBuffered = (CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal ==
- clang::TargetOptions::AMDGPUPrintfKind::Buffered);
+ auto PFK = CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal;
+ bool isBuffered =
+ (PFK == clang::TargetOptions::AMDGPUPrintfKind::Buffered);
auto Printf = llvm::emitAMDGPUPrintfCall(IRB, Args, isBuffered);
Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
return RValue::get(Printf);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b462f5a44057d94..b63c777fd1f158c 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4742,6 +4742,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.ClaimAllArgs(options::OPT_gen_cdb_fragment_path);
}
+ if (TC.getTriple().isAMDGPU() && types::isOpenCL(Input.getType())) {
+ if (Args.getLastArg(options::OPT_mprintf_kind_EQ)) {
+ CmdArgs.push_back(Args.MakeArgString(
+ "-mprintf-kind=" +
+ Args.getLastArgValue(options::OPT_mprintf_kind_EQ)));
+ // Force compiler error on invalid conversion specifiers
+ CmdArgs.push_back(Args.MakeArgString("-Werror=format-invalid-specifier"));
+ }
+ }
+
if (IsCuda || IsHIP) {
// We have to pass the triple of the host if compiling for a CUDA/HIP device
// and vice-versa.
>From b443e11ee074c5ec89cb1072583bad6c8ffaa897 Mon Sep 17 00:00:00 2001
From: Vikram <Vikram.Hegde at amd.com>
Date: Mon, 20 Nov 2023 05:28:10 +0000
Subject: [PATCH 3/3] [AMDGPU] Add vector processing support to AMDGPU printf
---
clang/test/CodeGenOpenCL/amdgpu-printf.cl | 205 +++++++++++++++++-
.../lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 135 +++++++-----
2 files changed, 288 insertions(+), 52 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
index edf6dbf8657cbe5..c6cab062a45618e 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
@@ -1,5 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -mprintf-kind=buffered -disable-llvm-passes -emit-llvm -o - %s | FileCheck --check-prefix=CHECK_BUFFERED %s
+// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -mprintf-kind=hostcall -disable-llvm-passes -emit-llvm -o - %s | FileCheck --check-prefix=CHECK_HOSTCALL %s
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
@@ -7,6 +8,61 @@ int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)))
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: ret void
+// CHECK_BUFFERED-LABEL: @test_printf_noargs(
+// CHECK_BUFFERED-NEXT: entry:
+// CHECK_BUFFERED-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_BUFFERED: strlen.while:
+// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str to ptr), [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_BUFFERED-NEXT: [[TMP1]] = getelementptr i8, ptr [[TMP0]], i64 1
+// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 1
+// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0
+// CHECK_BUFFERED-NEXT: br i1 [[TMP3]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_BUFFERED: strlen.while.done:
+// CHECK_BUFFERED-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP0]] to i64
+// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str to ptr) to i64)
+// CHECK_BUFFERED-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1
+// CHECK_BUFFERED-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_BUFFERED: strlen.join:
+// CHECK_BUFFERED-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP6]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_BUFFERED-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 7
+// CHECK_BUFFERED-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 4294967288
+// CHECK_BUFFERED-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4
+// CHECK_BUFFERED-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
+// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP11]])
+// CHECK_BUFFERED-NEXT: [[TMP12:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP12]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK_BUFFERED: end.block:
+// CHECK_BUFFERED-NEXT: [[TMP13:%.*]] = xor i1 [[TMP12]], true
+// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP13]] to i32
+// CHECK_BUFFERED-NEXT: ret void
+// CHECK_BUFFERED: argpush.block:
+// CHECK_BUFFERED-NEXT: [[TMP14:%.*]] = shl i32 [[TMP11]], 2
+// CHECK_BUFFERED-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK_BUFFERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP15]], ptr align 1 addrspacecast (ptr addrspace(4) @.str to ptr), i64 [[TMP7]], i1 false)
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP15]], i64 [[TMP9]]
+// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]]
+//
+// CHECK_HOSTCALL-LABEL: @test_printf_noargs(
+// CHECK_HOSTCALL-NEXT: entry:
+// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_HOSTCALL: strlen.while:
+// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str to ptr), [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP2]] = getelementptr i8, ptr [[TMP1]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP1]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP4:%.*]] = icmp eq i8 [[TMP3]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP4]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_HOSTCALL: strlen.while.done:
+// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP1]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str to ptr) to i64)
+// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_HOSTCALL: strlen.join:
+// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = phi i64 [ [[TMP7]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP0]], ptr addrspacecast (ptr addrspace(4) @.str to ptr), i64 [[TMP8]], i32 1)
+// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+// CHECK_HOSTCALL-NEXT: ret void
//
__kernel void test_printf_noargs() {
printf("");
@@ -19,6 +75,53 @@ __kernel void test_printf_noargs() {
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR4]]
// CHECK-NEXT: ret void
+// CHECK_BUFFERED-LABEL: @test_printf_int(
+// CHECK_BUFFERED-NEXT: entry:
+// CHECK_BUFFERED-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_BUFFERED-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12:![0-9]+]]
+// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]]
+// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 20)
+// CHECK_BUFFERED-NEXT: [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP1]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK_BUFFERED: end.block:
+// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = xor i1 [[TMP1]], true
+// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP2]] to i32
+// CHECK_BUFFERED-NEXT: ret void
+// CHECK_BUFFERED: argpush.block:
+// CHECK_BUFFERED-NEXT: store i32 82, ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK_BUFFERED-NEXT: store i64 -2582314622382785113, ptr addrspace(1) [[TMP3]], align 8
+// CHECK_BUFFERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP3]], i32 8
+// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_BUFFERED-NEXT: store i64 [[TMP5]], ptr addrspace(1) [[TMP4]], align 8
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 8
+// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]]
+//
+// CHECK_HOSTCALL-LABEL: @test_printf_int(
+// CHECK_HOSTCALL-NEXT: entry:
+// CHECK_HOSTCALL-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_HOSTCALL-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9:![0-9]+]]
+// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]]
+// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_HOSTCALL: strlen.while:
+// CHECK_HOSTCALL-NEXT: [[TMP2:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.1 to ptr), [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP3]] = getelementptr i8, ptr [[TMP2]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP5]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_HOSTCALL: strlen.while.done:
+// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP2]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr) to i64)
+// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_HOSTCALL: strlen.join:
+// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = phi i64 [ [[TMP8]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP1]], ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), i64 [[TMP9]], i32 0)
+// CHECK_HOSTCALL-NEXT: [[TMP11:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP12:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP10]], i32 1, i64 [[TMP11]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1)
+// CHECK_HOSTCALL-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
+// CHECK_HOSTCALL-NEXT: ret void
//
__kernel void test_printf_int(int i) {
printf("%d", i);
@@ -36,6 +139,106 @@ __kernel void test_printf_int(int i) {
// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5]]
// CHECK-NEXT: ret void
+// CHECK_BUFFERED-LABEL: @test_printf_str_int(
+// CHECK_BUFFERED-NEXT: entry:
+// CHECK_BUFFERED-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_BUFFERED-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5)
+// CHECK_BUFFERED-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]]
+// CHECK_BUFFERED-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR1:[0-9]+]]
+// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false)
+// CHECK_BUFFERED-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0
+// CHECK_BUFFERED-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA12]]
+// CHECK_BUFFERED-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr
+// CHECK_BUFFERED-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP2]], label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_BUFFERED: strlen.while:
+// CHECK_BUFFERED-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_BUFFERED-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1
+// CHECK_BUFFERED-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
+// CHECK_BUFFERED-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0
+// CHECK_BUFFERED-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_BUFFERED: strlen.while.done:
+// CHECK_BUFFERED-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP1]] to i64
+// CHECK_BUFFERED-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP3]] to i64
+// CHECK_BUFFERED-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[TMP7]]
+// CHECK_BUFFERED-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1
+// CHECK_BUFFERED-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_BUFFERED: strlen.join:
+// CHECK_BUFFERED-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_BUFFERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 7
+// CHECK_BUFFERED-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4294967288
+// CHECK_BUFFERED-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 20
+// CHECK_BUFFERED-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+// CHECK_BUFFERED-NEXT: [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 [[TMP15]])
+// CHECK_BUFFERED-NEXT: [[TMP16:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
+// CHECK_BUFFERED-NEXT: br i1 [[TMP16]], label [[ARGPUSH_BLOCK:%.*]], label [[END_BLOCK:%.*]]
+// CHECK_BUFFERED: end.block:
+// CHECK_BUFFERED-NEXT: [[TMP17:%.*]] = xor i1 [[TMP16]], true
+// CHECK_BUFFERED-NEXT: [[PRINTF_RESULT:%.*]] = sext i1 [[TMP17]] to i32
+// CHECK_BUFFERED-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR1]]
+// CHECK_BUFFERED-NEXT: ret void
+// CHECK_BUFFERED: argpush.block:
+// CHECK_BUFFERED-NEXT: [[TMP18:%.*]] = shl i32 [[TMP15]], 2
+// CHECK_BUFFERED-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], 2
+// CHECK_BUFFERED-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[PRINTF_ALLOC_FN]], align 4
+// CHECK_BUFFERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
+// CHECK_BUFFERED-NEXT: store i64 -2942283388077972797, ptr addrspace(1) [[TMP20]], align 8
+// CHECK_BUFFERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i32 8
+// CHECK_BUFFERED-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 1 [[TMP21]], ptr align 1 [[TMP1]], i64 [[TMP11]], i1 false)
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP21]], i64 [[TMP13]]
+// CHECK_BUFFERED-NEXT: [[TMP22:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_BUFFERED-NEXT: store i64 [[TMP22]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8
+// CHECK_BUFFERED-NEXT: [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8
+// CHECK_BUFFERED-NEXT: br label [[END_BLOCK]]
+//
+// CHECK_HOSTCALL-LABEL: @test_printf_str_int(
+// CHECK_HOSTCALL-NEXT: entry:
+// CHECK_HOSTCALL-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK_HOSTCALL-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5)
+// CHECK_HOSTCALL-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]]
+// CHECK_HOSTCALL-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR3:[0-9]+]]
+// CHECK_HOSTCALL-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false)
+// CHECK_HOSTCALL-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0
+// CHECK_HOSTCALL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]]
+// CHECK_HOSTCALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr
+// CHECK_HOSTCALL-NEXT: [[TMP2:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK_HOSTCALL-NEXT: br i1 icmp eq (ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), ptr null), label [[STRLEN_JOIN:%.*]], label [[STRLEN_WHILE:%.*]]
+// CHECK_HOSTCALL: strlen.while:
+// CHECK_HOSTCALL-NEXT: [[TMP3:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.2 to ptr), [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[STRLEN_WHILE]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP4]] = getelementptr i8, ptr [[TMP3]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP6]], label [[STRLEN_WHILE_DONE:%.*]], label [[STRLEN_WHILE]]
+// CHECK_HOSTCALL: strlen.while.done:
+// CHECK_HOSTCALL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP3]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr) to i64)
+// CHECK_HOSTCALL-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN]]
+// CHECK_HOSTCALL: strlen.join:
+// CHECK_HOSTCALL-NEXT: [[TMP10:%.*]] = phi i64 [ [[TMP9]], [[STRLEN_WHILE_DONE]] ], [ 0, [[ENTRY]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP11:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP2]], ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), i64 [[TMP10]], i32 0)
+// CHECK_HOSTCALL-NEXT: [[TMP12:%.*]] = icmp eq ptr [[TMP1]], null
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP12]], label [[STRLEN_JOIN1:%.*]], label [[STRLEN_WHILE2:%.*]]
+// CHECK_HOSTCALL: strlen.while2:
+// CHECK_HOSTCALL-NEXT: [[TMP13:%.*]] = phi ptr [ [[TMP1]], [[STRLEN_JOIN]] ], [ [[TMP14:%.*]], [[STRLEN_WHILE2]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP14]] = getelementptr i8, ptr [[TMP13]], i64 1
+// CHECK_HOSTCALL-NEXT: [[TMP15:%.*]] = load i8, ptr [[TMP13]], align 1
+// CHECK_HOSTCALL-NEXT: [[TMP16:%.*]] = icmp eq i8 [[TMP15]], 0
+// CHECK_HOSTCALL-NEXT: br i1 [[TMP16]], label [[STRLEN_WHILE_DONE3:%.*]], label [[STRLEN_WHILE2]]
+// CHECK_HOSTCALL: strlen.while.done3:
+// CHECK_HOSTCALL-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP1]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP19:%.*]] = sub i64 [[TMP18]], [[TMP17]]
+// CHECK_HOSTCALL-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 1
+// CHECK_HOSTCALL-NEXT: br label [[STRLEN_JOIN1]]
+// CHECK_HOSTCALL: strlen.join1:
+// CHECK_HOSTCALL-NEXT: [[TMP21:%.*]] = phi i64 [ [[TMP20]], [[STRLEN_WHILE_DONE3]] ], [ 0, [[STRLEN_JOIN]] ]
+// CHECK_HOSTCALL-NEXT: [[TMP22:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP11]], ptr [[TMP1]], i64 [[TMP21]], i32 0)
+// CHECK_HOSTCALL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK_HOSTCALL-NEXT: [[TMP24:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP22]], i32 1, i64 [[TMP23]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1)
+// CHECK_HOSTCALL-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
+// CHECK_HOSTCALL-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR3]]
+// CHECK_HOSTCALL-NEXT: ret void
//
__kernel void test_printf_str_int(int i) {
char s[] = "foo";
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index 2195406c144c8ba..99c714d963d2884 100644
--- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -26,28 +26,31 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-emit-printf"
-static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
+static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg,
+ bool IsBuffered) {
+ const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
auto Int64Ty = Builder.getInt64Ty();
auto Ty = Arg->getType();
if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
- switch (IntTy->getBitWidth()) {
- case 32:
- return Builder.CreateZExt(Arg, Int64Ty);
- case 64:
- return Arg;
+ if (IntTy->getBitWidth() < 64) {
+ return Builder.CreateZExt(Arg, Builder.getInt64Ty());
}
}
- if (Ty->getTypeID() == Type::DoubleTyID) {
+ if (Ty->isFloatingPointTy()) {
+ if (DL.getTypeAllocSize(Ty) < 8)
+ Arg = Builder.CreateFPExt(Arg, Builder.getDoubleTy());
+ if (IsBuffered)
+ return Arg;
return Builder.CreateBitCast(Arg, Int64Ty);
}
- if (isa<PointerType>(Ty)) {
+ if (!IsBuffered && isa<PointerType>(Ty)) {
return Builder.CreatePtrToInt(Arg, Int64Ty);
}
- llvm_unreachable("unexpected type");
+ return Arg;
}
static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
@@ -74,8 +77,8 @@ static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
}
static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool IsLast) {
- auto Arg0 = fitArgInto64Bits(Builder, Arg);
+ bool IsLast, bool IsBuffered) {
+ auto Arg0 = fitArgInto64Bits(Builder, Arg, IsBuffered);
auto Zero = Builder.getInt64(0);
return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
Zero, IsLast);
@@ -170,20 +173,49 @@ static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
}
+static Value *appendVectorArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast, bool IsBuffered) {
+ assert(Arg->getType()->isVectorTy() && "incorrent append* function");
+ auto VectorTy = dyn_cast<FixedVectorType>(Arg->getType());
+ auto Zero = Builder.getInt64(0);
+ if (VectorTy) {
+ for (unsigned int i = 0; i < VectorTy->getNumElements() - 1; i++) {
+ auto Val = Builder.CreateExtractElement(Arg, i);
+ Desc = callAppendArgs(Builder, Desc, 1,
+ fitArgInto64Bits(Builder, Val, IsBuffered), Zero,
+ Zero, Zero, Zero, Zero, Zero, false);
+ }
+
+ Value* Val =
+ Builder.CreateExtractElement(Arg, VectorTy->getNumElements() - 1);
+ return callAppendArgs(Builder, Desc, 1,
+ fitArgInto64Bits(Builder, Val, IsBuffered), Zero,
+ Zero, Zero, Zero, Zero, Zero, IsLast);
+ }
+ return nullptr;
+}
+
static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool SpecIsCString, bool IsLast) {
+ bool SpecIsCString, bool IsVector, bool IsLast,
+ bool IsBuffered) {
if (SpecIsCString && isa<PointerType>(Arg->getType())) {
return appendString(Builder, Desc, Arg, IsLast);
}
- // If the format specifies a string but the argument is not, the frontend will
- // have printed a warning. We just rely on undefined behaviour and send the
- // argument anyway.
- return appendArg(Builder, Desc, Arg, IsLast);
+
+ if (IsVector) {
+ return appendVectorArg(Builder, Desc, Arg, IsLast, IsBuffered);
+ }
+
+ // If the format specifies a string but the argument is not, the frontend
+ // will have printed a warning. We just rely on undefined behaviour and send
+ // the argument anyway.
+ return appendArg(Builder, Desc, Arg, IsLast, IsBuffered);
}
// Scan the format string to locate all specifiers, and mark the ones that
// specify a string, i.e, the "%s" specifier with optional '*' characters.
-static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) {
+static void locateCStringsAndVectors(SparseBitVector<8> &BV,
+ SparseBitVector<8> &OV, StringRef Str) {
static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
size_t SpecPos = 0;
// Skip the first argument, the format string.
@@ -194,6 +226,8 @@ static void locateCStrings(SparseBitVector<8> &BV, StringRef Str) {
SpecPos += 2;
continue;
}
+ if (Str.find_first_of("v", SpecPos) != StringRef::npos)
+ OV.set(ArgIdx);
auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
if (SpecEnd == StringRef::npos)
return;
@@ -224,7 +258,8 @@ struct StringData {
static Value *callBufferedPrintfStart(
IRBuilder<> &Builder, ArrayRef<Value *> Args, Value *Fmt,
bool isConstFmtStr, SparseBitVector<8> &SpecIsCString,
- SmallVectorImpl<StringData> &StringContents, Value *&ArgSize) {
+ SparseBitVector<8> &OCLVectors, SmallVectorImpl<StringData> &StringContents,
+ Value *&ArgSize) {
Module *M = Builder.GetInsertBlock()->getModule();
Value *NonConstStrLen = nullptr;
Value *LenWithNull = nullptr;
@@ -278,7 +313,12 @@ static Value *callBufferedPrintfStart(
StringData(StringRef(), LenWithNull, LenWithNullAligned, false));
}
} else {
- int AllocSize = M->getDataLayout().getTypeAllocSize(Args[i]->getType());
+ int AllocSize = 0;
+ if (OCLVectors.test(i)) {
+ auto VecArg = cast<FixedVectorType>(Args[i]->getType());
+ AllocSize = VecArg->getNumElements() * 8;
+ } else
+ AllocSize = M->getDataLayout().getTypeAllocSize(Args[i]->getType());
// We end up expanding non string arguments to 8 bytes
// (args smaller than 8 bytes)
BufSize += std::max(AllocSize, 8);
@@ -352,30 +392,10 @@ static void processConstantStringArg(StringData *SD, IRBuilder<> &Builder,
WhatToStore.push_back(ConstantInt::get(Builder.getInt32Ty(), 0));
}
-static Value *processNonStringArg(Value *Arg, IRBuilder<> &Builder) {
- const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();
- auto Ty = Arg->getType();
-
- if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
- if (IntTy->getBitWidth() < 64) {
- return Builder.CreateZExt(Arg, Builder.getInt64Ty());
- }
- }
-
- if (Ty->isFloatingPointTy()) {
- if (DL.getTypeAllocSize(Ty) < 8) {
- return Builder.CreateFPExt(Arg, Builder.getDoubleTy());
- }
- }
-
- return Arg;
-}
-
-static void
-callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef<Value *> Args,
- Value *PtrToStore, SparseBitVector<8> &SpecIsCString,
- SmallVectorImpl<StringData> &StringContents,
- bool IsConstFmtStr) {
+static void callBufferedPrintfArgPush(
+ IRBuilder<> &Builder, ArrayRef<Value *> Args, Value *PtrToStore,
+ SparseBitVector<8> &SpecIsCString, SparseBitVector<8> &OCLVectors,
+ SmallVectorImpl<StringData> &StringContents, bool IsConstFmtStr) {
Module *M = Builder.GetInsertBlock()->getModule();
const DataLayout &DL = M->getDataLayout();
auto StrIt = StringContents.begin();
@@ -407,7 +427,17 @@ callBufferedPrintfArgPush(IRBuilder<> &Builder, ArrayRef<Value *> Args,
continue;
}
} else {
- WhatToStore.push_back(processNonStringArg(Args[i], Builder));
+ if (OCLVectors.test(i)) {
+ auto VectorTy = dyn_cast<FixedVectorType>(Args[i]->getType());
+ auto VecArg = Args[i];
+ for (unsigned int Num = 0; Num < VectorTy->getNumElements(); Num++) {
+ auto Val = Builder.CreateExtractElement(VecArg, Num);
+ WhatToStore.push_back(
+ fitArgInto64Bits(Builder, Val, /*IsBuffered*/ true));
+ }
+ } else
+ WhatToStore.push_back(
+ fitArgInto64Bits(Builder, Args[i], /*IsBuffered*/ true));
}
for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
@@ -434,10 +464,11 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
auto Fmt = Args[0];
SparseBitVector<8> SpecIsCString;
+ SparseBitVector<8> OCLVectors;
StringRef FmtStr;
if (getConstantStringInfo(Fmt, FmtStr))
- locateCStrings(SpecIsCString, FmtStr);
+ locateCStringsAndVectors(SpecIsCString, OCLVectors, FmtStr);
if (IsBuffered) {
SmallVector<StringData, 8> StringContents;
@@ -448,9 +479,9 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
bool IsConstFmtStr = !FmtStr.empty();
Value *ArgSize = nullptr;
- Value *Ptr =
- callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr,
- SpecIsCString, StringContents, ArgSize);
+ Value *Ptr = callBufferedPrintfStart(Builder, Args, Fmt, IsConstFmtStr,
+ SpecIsCString, OCLVectors,
+ StringContents, ArgSize);
// The buffered version still follows OpenCL printf standards for
// printf return value, i.e 0 on success, -1 on failure.
@@ -513,8 +544,8 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
}
// Push The printf arguments onto buffer
- callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, StringContents,
- IsConstFmtStr);
+ callBufferedPrintfArgPush(Builder, Args, Ptr, SpecIsCString, OCLVectors,
+ StringContents, IsConstFmtStr);
// End block, returns -1 on failure
BranchInst::Create(End, ArgPush);
@@ -531,7 +562,9 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
for (unsigned int i = 1; i != NumOps; ++i) {
bool IsLast = i == NumOps - 1;
bool IsCString = SpecIsCString.test(i);
- Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
+ bool IsVector = OCLVectors.test(i);
+ Desc = processArg(Builder, Desc, Args[i], IsCString, IsVector, IsLast,
+ IsBuffered);
}
return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
More information about the llvm-commits
mailing list