[llvm-branch-commits] [clang] [CIR][NVPTX] Implement sync and cluster barrier builtins (PR #195217)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Apr 30 22:36:40 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
Author: David Rivera (RiverDave)
<details>
<summary>Changes</summary>
Related: https://github.com/llvm/llvm-project/issues/179278, https://github.com/llvm/llvm-project/issues/175871
---
Full diff: https://github.com/llvm/llvm-project/pull/195217.diff
3 Files Affected:
- (modified) clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp (+28-32)
- (added) clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu (+45)
- (added) clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu (+44)
``````````diff
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
index 1c527b4dab614..0eb55787f4d88 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp
@@ -903,45 +903,41 @@ CIRGenFunction::emitNVPTXBuiltinExpr(unsigned builtinId, const CallExpr *expr) {
getContext().BuiltinInfo.getName(builtinId));
return mlir::Value{};
case NVPTX::BI__nvvm_barrier_cluster_arrive:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+ "nvvm.barrier.cluster.arrive",
+ builder.getVoidTy());
case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+ "nvvm.barrier.cluster.arrive.relaxed",
+ builder.getVoidTy());
case NVPTX::BI__nvvm_barrier_cluster_wait:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+ "nvvm.barrier.cluster.wait",
+ builder.getVoidTy());
case NVPTX::BI__nvvm_fence_sc_cluster:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(getLoc(expr->getExprLoc()),
+ "nvvm.fence.sc.cluster",
+ builder.getVoidTy());
case NVPTX::BI__nvvm_bar_sync:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(
+ getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.aligned.all",
+ builder.getVoidTy(), mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
case NVPTX::BI__syncthreads:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(
+ getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.aligned.all",
+ builder.getVoidTy(),
+ mlir::ValueRange{builder.getConstInt(getLoc(expr->getExprLoc()),
+ builder.getSInt32Ty(), 0)});
case NVPTX::BI__nvvm_barrier_sync:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(
+ getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.all",
+ builder.getVoidTy(), mlir::ValueRange{emitScalarExpr(expr->getArg(0))});
case NVPTX::BI__nvvm_barrier_sync_cnt:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented NVPTX builtin call: ") +
- getContext().BuiltinInfo.getName(builtinId));
- return mlir::Value{};
+ return builder.emitIntrinsicCallOp(
+ getLoc(expr->getExprLoc()), "nvvm.barrier.cta.sync.count",
+ builder.getVoidTy(),
+ mlir::ValueRange{emitScalarExpr(expr->getArg(0)),
+ emitScalarExpr(expr->getArg(1))});
default:
return std::nullopt;
}
diff --git a/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu
new file mode 100644
index 0000000000000..b1a57856dc392
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-nvptx-sync.cu
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \
+// RUN: -target-feature +ptx62 -fclangir -fcuda-is-device \
+// RUN: -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \
+// RUN: -target-feature +ptx62 -fclangir -fcuda-is-device \
+// RUN: -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \
+// RUN: -target-feature +ptx62 -fcuda-is-device \
+// RUN: -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+#define __device__ __attribute__((device))
+
+// Tests CIR/LLVM lowering for NVPTX CTA-level sync barrier builtins.
+// Mirrors the relevant slices of clang/test/CodeGen/builtins-nvptx.c and
+// clang/test/CodeGen/builtins-nvptx-ptx60.cu.
+
+// CIR-LABEL: cir.func {{.*}} @_Z9nvvm_syncj
+// LLVM-LABEL: define{{.*}} void @_Z9nvvm_syncj(
+// OGCG-LABEL: define{{.*}} void @_Z9nvvm_syncj(
+__device__ void nvvm_sync(unsigned mask) {
+ // CIR: cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.aligned.all"
+ // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+ // OGCG: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+ __nvvm_bar_sync(0);
+
+ // CIR: cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.aligned.all"
+ // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+ // OGCG: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+ __syncthreads();
+
+ // CIR: cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.all"
+ // LLVM: call void @llvm.nvvm.barrier.cta.sync.all(i32 %{{.*}})
+ // OGCG: call void @llvm.nvvm.barrier.cta.sync.all(i32 %{{.*}})
+ __nvvm_barrier_sync(mask);
+
+ // CIR: cir.call_llvm_intrinsic "nvvm.barrier.cta.sync.count"
+ // LLVM: call void @llvm.nvvm.barrier.cta.sync.count(i32 %{{.*}}, i32 0)
+ // OGCG: call void @llvm.nvvm.barrier.cta.sync.count(i32 %{{.*}}, i32 0)
+ __nvvm_barrier_sync_cnt(mask, 0);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu
new file mode 100644
index 0000000000000..3c37c0bd927c9
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/NVPTX/builtins-sm90.cu
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_90 \
+// RUN: -target-feature +ptx80 -fclangir -fcuda-is-device \
+// RUN: -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_90 \
+// RUN: -target-feature +ptx80 -fclangir -fcuda-is-device \
+// RUN: -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_90 \
+// RUN: -target-feature +ptx80 -fcuda-is-device \
+// RUN: -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+#define __global__ __attribute__((global))
+
+// Tests CIR/LLVM lowering for sm_90 cluster barrier and fence builtins.
+// Mirrors the relevant slice of clang/test/CodeGenCUDA/builtins-sm90.cu.
+
+// CIR-LABEL: cir.func {{.*}} @_Z6kernelv
+// LLVM-LABEL: define{{.*}} void @_Z6kernelv(
+// OGCG-LABEL: define{{.*}} void @_Z6kernelv(
+__global__ void kernel() {
+ // CIR: cir.call_llvm_intrinsic "nvvm.barrier.cluster.arrive"
+ // LLVM: call void @llvm.nvvm.barrier.cluster.arrive()
+ // OGCG: call void @llvm.nvvm.barrier.cluster.arrive()
+ __nvvm_barrier_cluster_arrive();
+
+ // CIR: cir.call_llvm_intrinsic "nvvm.barrier.cluster.arrive.relaxed"
+ // LLVM: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+ // OGCG: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+ __nvvm_barrier_cluster_arrive_relaxed();
+
+ // CIR: cir.call_llvm_intrinsic "nvvm.barrier.cluster.wait"
+ // LLVM: call void @llvm.nvvm.barrier.cluster.wait()
+ // OGCG: call void @llvm.nvvm.barrier.cluster.wait()
+ __nvvm_barrier_cluster_wait();
+
+ // CIR: cir.call_llvm_intrinsic "nvvm.fence.sc.cluster"
+ // LLVM: call void @llvm.nvvm.fence.sc.cluster()
+ // OGCG: call void @llvm.nvvm.fence.sc.cluster()
+ __nvvm_fence_sc_cluster();
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/195217
More information about the llvm-branch-commits
mailing list