[llvm-branch-commits] [clang] [CIR][NVPTX] Implement sync and cluster barrier builtins (PR #195217)

David Rivera via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Apr 30 22:09:59 PDT 2026


https://github.com/RiverDave updated https://github.com/llvm/llvm-project/pull/195217

>From c8fa22e77b1ff3e5c7d1153db9b96b241f247dbb Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Fri, 1 May 2026 00:58:41 -0400
Subject: [PATCH 1/2] [CIR][NVPTX] Implement sync and cluster barrier builtins

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp  | 62 +++++++++----------
 .../NVPTX/builtins-nvptx-sync.cu              | 45 ++++++++++++++
 .../CodeGenBuiltins/NVPTX/builtins-sm90.cu    | 44 +++++++++++++
 3 files changed, 119 insertions(+), 32 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu
 create mode 100644 clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
index 1c527b4dab614..2babb7d047993 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
@@ -903,45 +903,43 @@ CIRGenFunction::emitNVPTXBuiltinExpr(unsigned builtinId, const CallExpr *expr) {
                      getContext().BuiltinInfo.getName(builtinId));
     return mlir::Value{};
   case NVPTX::BI__nvvm_barrier_cluster_arrive:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+                                       "nvvm.barrier.cluster.arrive",
+                                       builder.getVoidTy());
   case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+                                       "nvvm.barrier.cluster.arrive.relaxed",
+                                       builder.getVoidTy());
   case NVPTX::BI__nvvm_barrier_cluster_wait:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+                                       "nvvm.barrier.cluster.wait",
+                                       builder.getVoidTy());
   case NVPTX::BI__nvvm_fence_sc_cluster:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+                                       "nvvm.fence.sc.cluster",
+                                       builder.getVoidTy());
   case NVPTX::BI__nvvm_bar_sync:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(
+        getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.aligned.all",
+        builder.getVoidTy(),
+        mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
   case NVPTX::BI__syncthreads:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(
+        getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.aligned.all",
+        builder.getVoidTy(),
+        mlir::ValueRange{builder.getConstInt(
+            getLoc(expr->getExprLoc()), builder.getSInt32Ty(), 0)});
   case NVPTX::BI__nvvm_barrier_sync:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(
+        getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.all",
+        builder.getVoidTy(),
+        mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
   case NVPTX::BI__nvvm_barrier_sync_cnt:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented NVPTX builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    return builder.emitIntrinsicCallOp(
+        getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.count",
+        builder.getVoidTy(),
+        mlir::ValueRange{emitScalarExpr(expr->getArg(0)),
+                         emitScalarExpr(expr->getArg(1))});
   default:
     return std::nullopt;
   }
diff --git a/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu
new file mode 100644
index 0000000000000..b1a57856dc392
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \
+// RUN:            -target-feature +ptx62 -fclangir -fcuda-is-device \
+// RUN:            -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \
+// RUN:            -target-feature +ptx62 -fclangir -fcuda-is-device \
+// RUN:            -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \
+// RUN:            -target-feature +ptx62 -fcuda-is-device \
+// RUN:            -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+#define __device__ __attribute__((device))
+
+// Tests CIR/LLVM lowering for NVPTX CTA-level sync barrier builtins.
+// Mirrors the relevant slices of clang/test/CodeGen/builtins-nvptx.c and
+// clang/test/CodeGen/builtins-nvptx-ptx60.cu.
+
+// CIR-LABEL: cir.func {{.*}} @_Z9nvvm_syncj
+// LLVM-LABEL: define{{.*}} void @_Z9nvvm_syncj(
+// OGCG-LABEL: define{{.*}} void @_Z9nvvm_syncj(
+__device__ void nvvm_sync(unsigned mask) {
+  // CIR:  cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.aligned.all"
+  // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  // OGCG: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  __nvvm_bar_sync(0);
+
+  // CIR:  cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.aligned.all"
+  // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  // OGCG: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  __syncthreads();
+
+  // CIR:  cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.all"
+  // LLVM: call void @llvm.nvvm.barrier.cta.sync.all(i32 %{{.*}})
+  // OGCG: call void @llvm.nvvm.barrier.cta.sync.all(i32 %{{.*}})
+  __nvvm_barrier_sync(mask);
+
+  // CIR:  cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.count"
+  // LLVM: call void @llvm.nvvm.barrier.cta.sync.count(i32 %{{.*}}, i32 0)
+  // OGCG: call void @llvm.nvvm.barrier.cta.sync.count(i32 %{{.*}}, i32 0)
+  __nvvm_barrier_sync_cnt(mask, 0);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu
new file mode 100644
index 0000000000000..3c37c0bd927c9
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_90 \
+// RUN:            -target-feature +ptx80 -fclangir -fcuda-is-device \
+// RUN:            -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_90 \
+// RUN:            -target-feature +ptx80 -fclangir -fcuda-is-device \
+// RUN:            -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_90 \
+// RUN:            -target-feature +ptx80 -fcuda-is-device \
+// RUN:            -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+#define __global__ __attribute__((global))
+
+// Tests CIR/LLVM lowering for sm_90 cluster barrier and fence builtins.
+// Mirrors the relevant slice of clang/test/CodeGenCUDA/builtins-sm90.cu.
+
+// CIR-LABEL: cir.func {{.*}} @_Z6kernelv
+// LLVM-LABEL: define{{.*}} void @_Z6kernelv(
+// OGCG-LABEL: define{{.*}} void @_Z6kernelv(
+__global__ void kernel() {
+  // CIR:  cir.call_llvm_intrinsic "nvvm.barrier.cluster.arrive"
+  // LLVM: call void @llvm.nvvm.barrier.cluster.arrive()
+  // OGCG: call void @llvm.nvvm.barrier.cluster.arrive()
+  __nvvm_barrier_cluster_arrive();
+
+  // CIR:  cir.call_llvm_intrinsic "nvvm.barrier.cluster.arrive.relaxed"
+  // LLVM: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+  // OGCG: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+  __nvvm_barrier_cluster_arrive_relaxed();
+
+  // CIR:  cir.call_llvm_intrinsic "nvvm.barrier.cluster.wait"
+  // LLVM: call void @llvm.nvvm.barrier.cluster.wait()
+  // OGCG: call void @llvm.nvvm.barrier.cluster.wait()
+  __nvvm_barrier_cluster_wait();
+
+  // CIR:  cir.call_llvm_intrinsic "nvvm.fence.sc.cluster"
+  // LLVM: call void @llvm.nvvm.fence.sc.cluster()
+  // OGCG: call void @llvm.nvvm.fence.sc.cluster()
+  __nvvm_fence_sc_cluster();
+}

>From 5bf4913546c1af0ce0d518b6dd955eb398a6f0d6 Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg at gmail.com>
Date: Fri, 1 May 2026 01:09:46 -0400
Subject: [PATCH 2/2] fix fmt

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
index 2babb7d047993..0eb55787f4d88 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
@@ -921,19 +921,17 @@ CIRGenFunction::emitNVPTXBuiltinExpr(unsigned builtinId, const CallExpr *expr) {
   case NVPTX::BI__nvvm_bar_sync:
     return builder.emitIntrinsicCallOp(
         getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.aligned.all",
-        builder.getVoidTy(),
-        mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
+        builder.getVoidTy(), mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
   case NVPTX::BI__syncthreads:
     return builder.emitIntrinsicCallOp(
         getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.aligned.all",
         builder.getVoidTy(),
-        mlir::ValueRange{builder.getConstInt(
-            getLoc(expr->getExprLoc()), builder.getSInt32Ty(), 0)});
+        mlir::ValueRange{builder.getConstInt(getLoc(expr->getExprLoc()),
+                                             builder.getSInt32Ty(), 0)});
   case NVPTX::BI__nvvm_barrier_sync:
     return builder.emitIntrinsicCallOp(
         getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.all",
-        builder.getVoidTy(),
-        mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
+        builder.getVoidTy(), mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
   case NVPTX::BI__nvvm_barrier_sync_cnt:
     return builder.emitIntrinsicCallOp(
         getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.count",



More information about the llvm-branch-commits mailing list