[clang] [CIR][CUDA] Support device-side printf for NVPTX (PR #196573)

Fri May 8 09:37:49 PDT 2026

https://github.com/AbdallahRashed created https://github.com/llvm/llvm-project/pull/196573

Implement device-side printf lowering for NVPTX targets in CIR codegen. The variadic arguments are packed into a stack-allocated struct and passed to vprintf, matching the classic codegen behavior in CGGPUBuiltin.cpp

When the target triple is NVPTX and the builtin is printf/__builtin_printf, we route to emitNVPTXDevicePrintfCallExpr
The no-varargs case passes a null pointer directly.

AMDGCN device printf remains NYI.
part of https://github.com/llvm/llvm-project/issues/179278

>From 2c8f237741352026fda4cbca57a3015257d38127 Mon Sep 17 00:00:00 2001
From: AbdallahRashed <abdallah.mrashed at gmail.com>
Date: Fri, 8 May 2026 18:19:23 +0200
Subject: [PATCH] [CIR][CUDA] Support device-side printf for NVPTX

Implement device-side printf lowering for NVPTX targets in CIR codegen.
The variadic arguments are packed into a stack-allocated struct and passed
to vprintf, matching the classic codegen behavior in CGGPUBuiltin.cpp

When the target triple is NVPTX and the builtin is printf/__builtin_printf,
we route to emitNVPTXDevicePrintfCallExpr
The no-varargs case passes a null pointer directly.

AMDGCN device printf remains NYI.
---
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp      |   7 ++
 clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp | 100 +++++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.h       |   3 +
 clang/test/CIR/CodeGenCUDA/device-printf.cu  |  42 ++++++++
 4 files changed, 152 insertions(+)
 create mode 100644 clang/test/CIR/CodeGenCUDA/device-printf.cu

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index fe932834e9b55..9752d2571ed86 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -2400,6 +2400,13 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     return errorBuiltinNYI(*this, e, builtinID);
   case Builtin::BI__builtin_printf:
   case Builtin::BIprintf:
+    assert(e->getNumArgs() >= 1); // printf always has at least one arg.
+    if (getTarget().getTriple().isAMDGCN()) {
+      return errorBuiltinNYI(*this, e, builtinID);
+    }
+    if (getTarget().getTriple().isNVPTX()) {
+      return RValue::get(emitNVPTXDevicePrintfCallExpr(e));
+    }
     break;
   case Builtin::BI__builtin_canonicalize:
   case Builtin::BI__builtin_canonicalizef:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
index 52f98af8028b4..bd0607261e88b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
@@ -14,6 +14,7 @@
 
 #include "mlir/IR/Value.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "clang/CIR/Dialect/IR/CIRDataLayout.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 
 using namespace clang;
@@ -1008,3 +1009,102 @@ CIRGenFunction::emitNVPTXBuiltinExpr(unsigned builtinId, const CallExpr *expr) {
     return std::nullopt;
   }
 }
+
+// vprintf takes two args: A format string, and a pointer to a buffer containing
+// the varargs.
+//
+// For example, the call
+//
+//   printf("format string", arg1, arg2, arg3);
+//
+// is converted into something resembling
+//
+//   struct Tmp {
+//     Arg1 a1;
+//     Arg2 a2;
+//     Arg3 a3;
+//   };
+//   char* buf = alloca(sizeof(Tmp));
+//   *(Tmp*)buf = {a1, a2, a3};
+//   vprintf("format string", buf);
+//
+// `buf` is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of
+// the args is itself aligned to its preferred alignment.
+//
+// Note that by the time this function runs, the arguments have already
+// undergone the standard C vararg promotion (short -> int, float -> double
+// etc). In this function we pack the arguments into the buffer described above.
+static mlir::Value packArgsIntoNVPTXFormatBuffer(CIRGenFunction &cgf,
+                                                 const CallArgList &args,
+                                                 mlir::Location loc) {
+  const cir::CIRDataLayout dataLayout = cgf.cgm.getDataLayout();
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+
+  if (args.size() <= 1)
+    // If there are no arguments other than the format string,
+    // pass a nullptr to vprintf.
+    return builder.getNullPtr(builder.getVoidPtrTy(), loc);
+
+  llvm::SmallVector<mlir::Type, 8> argTypes;
+  for (const auto &arg : llvm::drop_begin(args))
+    argTypes.push_back(arg.getRValue(cgf, loc).getValue().getType());
+
+  // We can directly store the arguments into a struct, and the alignment
+  // would automatically be correct. That's because vprintf does not
+  // accept aggregates.
+  mlir::Type allocaTy = builder.getAnonRecordTy(argTypes);
+  auto allocaAlign = clang::CharUnits::fromQuantity(
+      dataLayout.getABITypeAlign(allocaTy).value());
+  Address allocaAddr =
+      cgf.createTempAlloca(allocaTy, allocaAlign, loc, "printf_args");
+  mlir::Value alloca = allocaAddr.getPointer();
+
+  for (auto [i, arg] : llvm::enumerate(llvm::drop_begin(args))) {
+    mlir::Value member = builder.createGetMember(
+        loc, cir::PointerType::get(argTypes[i]), alloca, /*name=*/"",
+        /*index=*/i);
+    auto abiAlign = clang::CharUnits::fromQuantity(
+        dataLayout.getABITypeAlign(argTypes[i]).value());
+    cir::StoreOp::create(builder, loc, arg.getRValue(cgf, loc).getValue(),
+                         member, /*is_volatile=*/false,
+                         builder.getAlignmentAttr(abiAlign),
+                         /*sync_scope=*/cir::SyncScopeKindAttr{},
+                         /*mem_order=*/cir::MemOrderAttr{});
+  }
+
+  return builder.createBitcast(alloca, builder.getVoidPtrTy());
+}
+
+mlir::Value
+CIRGenFunction::emitNVPTXDevicePrintfCallExpr(const CallExpr *expr) {
+  assert(cgm.getTriple().isNVPTX());
+  CallArgList args;
+  emitCallArgs(args,
+               expr->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               expr->arguments(), expr->getDirectCallee());
+
+  mlir::Location loc = getLoc(expr->getBeginLoc());
+
+  // Except the format string, no non-scalar arguments are allowed for
+  // device-side printf.
+  bool hasNonScalar =
+      llvm::any_of(llvm::drop_begin(args), [&](const CallArg &a) {
+        return !a.getRValue(*this, loc).isScalar();
+      });
+  if (hasNonScalar) {
+    cgm.errorUnsupported(expr, "non-scalar args to printf");
+    return builder.getConstInt(loc, builder.getSInt32Ty(), 0);
+  }
+
+  mlir::Value packedData = packArgsIntoNVPTXFormatBuffer(*this, args, loc);
+
+  // int vprintf(char *format, void *packedData);
+  auto vprintf = cgm.createRuntimeFunction(
+      cir::FuncType::get(
+          {cir::PointerType::get(builder.getSInt8Ty()), builder.getVoidPtrTy()},
+          builder.getSInt32Ty()),
+      "vprintf");
+  auto formatString = args[0].getRValue(*this, loc).getValue();
+  llvm::SmallVector<mlir::Value, 2> callArgs = {formatString, packedData};
+  return builder.createCallOp(loc, vprintf, callArgs).getResult();
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index de91b2f903018..dcc80817c866b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -2029,6 +2029,9 @@ class CIRGenFunction : public CIRGenTypeCache {
   std::optional<mlir::Value> emitNVPTXBuiltinExpr(unsigned builtinID,
                                                   const CallExpr *expr);
 
+  /// Emit a device-side printf call for NVPTX targets.
+  mlir::Value emitNVPTXDevicePrintfCallExpr(const CallExpr *expr);
+
   LValue emitOpaqueValueLValue(const OpaqueValueExpr *e);
 
   LValue emitConditionalOperatorLValue(const AbstractConditionalOperator *expr);
diff --git a/clang/test/CIR/CodeGenCUDA/device-printf.cu b/clang/test/CIR/CodeGenCUDA/device-printf.cu
new file mode 100644
index 0000000000000..8528cb39f0450
--- /dev/null
+++ b/clang/test/CIR/CodeGenCUDA/device-printf.cu
@@ -0,0 +1,42 @@
+#include "Inputs/cuda.h"
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_80 -x cuda \
+// RUN:            -fcuda-is-device -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_80 -x cuda \
+// RUN:            -fcuda-is-device -fclangir -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+__device__ void print_int() {
+  printf("%d", 42);
+}
+
+// CIR: cir.func no_inline dso_local @_Z9print_intv()
+// CIR:   %[[#ALLOCA:]] = cir.alloca !rec_anon_struct
+// CIR:   %[[#VAL:]] = cir.const #cir.int<42> : !s32i
+// CIR:   %[[#FIELD:]] = cir.get_member %[[#ALLOCA]][0]
+// CIR:   cir.store align(4) %[[#VAL]], %[[#FIELD]]
+// CIR:   %[[#BUF:]] = cir.cast bitcast %[[#ALLOCA]] : !cir.ptr<!rec_anon_struct> -> !cir.ptr<!void>
+// CIR:   cir.call @vprintf(%{{.+}}, %[[#BUF]])
+// CIR:   cir.return
+
+// LLVM: define dso_local void @_Z9print_intv()
+// LLVM:   %[[#PACKED:]] = alloca { i32 }
+// LLVM:   %[[#GEP:]] = getelementptr inbounds nuw { i32 }, ptr %[[#PACKED]], i32 0, i32 0
+// LLVM:   store i32 42, ptr %[[#GEP]], align 4
+// LLVM:   call i32 @vprintf(ptr @{{.*}}, ptr %[[#PACKED]])
+// LLVM:   ret void
+
+__device__ void print_no_args() {
+  printf("hello world");
+}
+
+// CIR: cir.func no_inline dso_local @_Z13print_no_argsv()
+// CIR:   %[[#NULL:]] = cir.const #cir.ptr<null> : !cir.ptr<!void>
+// CIR:   cir.call @vprintf(%{{.+}}, %[[#NULL]])
+// CIR:   cir.return
+
+// LLVM: define dso_local void @_Z13print_no_argsv()
+// LLVM:   call i32 @vprintf(ptr @{{.*}}, ptr null)
+// LLVM:   ret void