[llvm] [NVPTX] Support for fence.acquire and fence.release (PR #124865)
Akshay Deodhar via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 28 16:46:26 PST 2025
https://github.com/akshayrdeodhar created https://github.com/llvm/llvm-project/pull/124865
Adds codegen support for fence.acquire and fence.release, adds a script that generates tests for all legal fences (together with the generated tests), and cleans up some TableGen rules.
>From 6eea017741b5e2a88954e6c3ee18c8144c3e74a4 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Thu, 5 Dec 2024 06:36:04 +0000
Subject: [PATCH] [NVPTX] Support for fence.acquire and fence.release
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 41 ++++
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 37 +---
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 4 +
llvm/test/CodeGen/NVPTX/fence-sm-90.ll | 30 ---
llvm/test/CodeGen/NVPTX/fence-sm30.ll | 165 +++++++++++++++
llvm/test/CodeGen/NVPTX/fence-sm70.ll | 165 +++++++++++++++
llvm/test/CodeGen/NVPTX/fence-sm90.ll | 213 ++++++++++++++++++++
llvm/test/CodeGen/NVPTX/fence.ll | 102 ----------
llvm/test/CodeGen/NVPTX/fence.py | 38 ++++
llvm/test/CodeGen/NVPTX/lit.local.cfg | 1 +
10 files changed, 637 insertions(+), 159 deletions(-)
delete mode 100644 llvm/test/CodeGen/NVPTX/fence-sm-90.ll
create mode 100644 llvm/test/CodeGen/NVPTX/fence-sm30.ll
create mode 100644 llvm/test/CodeGen/NVPTX/fence-sm70.ll
create mode 100644 llvm/test/CodeGen/NVPTX/fence-sm90.ll
delete mode 100644 llvm/test/CodeGen/NVPTX/fence.ll
create mode 100644 llvm/test/CodeGen/NVPTX/fence.py
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ac8ce05724750c..ec654e0f3f200f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -648,9 +648,50 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
if (S == NVPTX::Scope::Cluster)
T->failIfClustersUnsupported(".cluster scope fence");
+ // Fall back to .acq_rel if .acquire, .release is not supported.
+ if (!T->hasSplitAcquireAndReleaseFences() &&
+ (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
+ O = NVPTX::Ordering::AcquireRelease;
+
switch (O) {
case NVPTX::Ordering::Acquire:
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_acquire_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(
+ formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+ ScopeToString(S)));
+ }
+ break;
case NVPTX::Ordering::Release:
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_release_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(
+ formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+ ScopeToString(S)));
+ }
+ break;
case NVPTX::Ordering::AcquireRelease: {
switch (S) {
case NVPTX::Scope::System:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 633a99d0fc1be3..74423d79e41e05 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3866,33 +3866,16 @@ def : Pat <
// PTX Fence instructions
////////////////////////////////////////////////////////////////////////////////
-def atomic_thread_fence_seq_cst_sys :
- NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_sys :
- NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-
-def atomic_thread_fence_seq_cst_gpu :
- NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_gpu :
- NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-
-def atomic_thread_fence_seq_cst_cluster :
- NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
- Requires<[hasPTX<78>, hasSM<90>]>;
-def atomic_thread_fence_acq_rel_cluster :
- NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
- Requires<[hasPTX<78>, hasSM<90>]>;
-
-def atomic_thread_fence_seq_cst_cta :
- NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_cta :
- NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
+class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
+ NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
+ Requires<[ptx, hasSM<70>]>;
+
+foreach scope = ["sys", "gpu", "cluster", "cta"] in {
+ def atomic_thread_fence_seq_cst_#scope: NVPTXFenceInst<scope, "sc", hasPTX<60>>;
+ def atomic_thread_fence_acq_rel_#scope: NVPTXFenceInst<scope, "acq_rel", hasPTX<60>>;
+ def atomic_thread_fence_acquire_#scope: NVPTXFenceInst<scope, "acquire", hasPTX<87>>;
+ def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<87>>;
+}
def fpimm_any_zero : FPImmLeaf<fAny, [{
return Imm.isZero();
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 919f487c701416..990ad3c62367fd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -88,6 +88,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
// release, acq_rel, sc) ?
bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
+ // Does SM & PTX support .acquire and .release qualifiers for fence?
+ bool hasSplitAcquireAndReleaseFences() const {
+ return SmVersion >= 90 && PTXVersion >= 86;
+ }
// Does SM & PTX support atomic relaxed MMIO operations ?
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
bool hasDotInstructions() const {
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
deleted file mode 100644
index dce39bf3e1e3ed..00000000000000
--- a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
-; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-
-; CHECK-LABEL: fence_sc_cluster
-define void @fence_sc_cluster() local_unnamed_addr {
- ; CHECK: fence.sc.cluster
- fence syncscope("cluster") seq_cst
- ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_cluster
-define void @fence_acq_rel_cluster() local_unnamed_addr {
- ; CHECK: fence.acq_rel.cluster
- fence syncscope("cluster") acq_rel
- ret void
-}
-
-; CHECK-LABEL: fence_release_cluster
-define void @fence_release_cluster() local_unnamed_addr {
- ; CHECK: fence.acq_rel.cluster
- fence syncscope("cluster") release
- ret void
-}
-
-; CHECK-LABEL: fence_acquire_cluster
-define void @fence_acquire_cluster() local_unnamed_addr {
- ; CHECK: fence.acq_rel.cluster
- fence syncscope("cluster") acquire
- ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm30.ll b/llvm/test/CodeGen/NVPTX/fence-sm30.ll
new file mode 100644
index 00000000000000..16365db21d5b9c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm30.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx50 | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx50 | %ptxas-verify %}
+
+
+define void @fence_acquire_() {
+; SM30-LABEL: fence_acquire_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") acquire
+ ret void
+}
+
+
+define void @fence_acquire_block() {
+; SM30-LABEL: fence_acquire_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") acquire
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_acquire_device() {
+; SM30-LABEL: fence_acquire_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") acquire
+ ret void
+}
+
+
+define void @fence_release_() {
+; SM30-LABEL: fence_release_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") release
+ ret void
+}
+
+
+define void @fence_release_block() {
+; SM30-LABEL: fence_release_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") release
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_release_device() {
+; SM30-LABEL: fence_release_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") release
+ ret void
+}
+
+
+define void @fence_acq_rel_() {
+; SM30-LABEL: fence_acq_rel_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_block() {
+; SM30-LABEL: fence_acq_rel_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_acq_rel_device() {
+; SM30-LABEL: fence_acq_rel_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+
+define void @fence_seq_cst_() {
+; SM30-LABEL: fence_seq_cst_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_block() {
+; SM30-LABEL: fence_seq_cst_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_seq_cst_device() {
+; SM30-LABEL: fence_seq_cst_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") seq_cst
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm70.ll b/llvm/test/CodeGen/NVPTX/fence-sm70.ll
new file mode 100644
index 00000000000000..085529571e0443
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm70.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify %}
+
+
+define void @fence_acquire_() {
+; SM70-LABEL: fence_acquire_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") acquire
+ ret void
+}
+
+
+define void @fence_acquire_block() {
+; SM70-LABEL: fence_acquire_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") acquire
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_acquire_device() {
+; SM70-LABEL: fence_acquire_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") acquire
+ ret void
+}
+
+
+define void @fence_release_() {
+; SM70-LABEL: fence_release_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") release
+ ret void
+}
+
+
+define void @fence_release_block() {
+; SM70-LABEL: fence_release_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") release
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_release_device() {
+; SM70-LABEL: fence_release_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") release
+ ret void
+}
+
+
+define void @fence_acq_rel_() {
+; SM70-LABEL: fence_acq_rel_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_block() {
+; SM70-LABEL: fence_acq_rel_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_acq_rel_device() {
+; SM70-LABEL: fence_acq_rel_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+
+define void @fence_seq_cst_() {
+; SM70-LABEL: fence_seq_cst_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_block() {
+; SM70-LABEL: fence_seq_cst_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_seq_cst_device() {
+; SM70-LABEL: fence_seq_cst_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") seq_cst
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-sm90.ll
new file mode 100644
index 00000000000000..6c1959d34df4e5
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm90.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify %}
+
+
+define void @fence_acquire_() {
+; SM90-LABEL: fence_acquire_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") acquire
+ ret void
+}
+
+
+define void @fence_acquire_block() {
+; SM90-LABEL: fence_acquire_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") acquire
+ ret void
+}
+
+
+define void @fence_acquire_cluster() {
+; SM90-LABEL: fence_acquire_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") acquire
+ ret void
+}
+
+
+define void @fence_acquire_device() {
+; SM90-LABEL: fence_acquire_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") acquire
+ ret void
+}
+
+
+define void @fence_release_() {
+; SM90-LABEL: fence_release_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") release
+ ret void
+}
+
+
+define void @fence_release_block() {
+; SM90-LABEL: fence_release_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") release
+ ret void
+}
+
+
+define void @fence_release_cluster() {
+; SM90-LABEL: fence_release_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") release
+ ret void
+}
+
+
+define void @fence_release_device() {
+; SM90-LABEL: fence_release_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") release
+ ret void
+}
+
+
+define void @fence_acq_rel_() {
+; SM90-LABEL: fence_acq_rel_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_block() {
+; SM90-LABEL: fence_acq_rel_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_cluster() {
+; SM90-LABEL: fence_acq_rel_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_device() {
+; SM90-LABEL: fence_acq_rel_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+
+define void @fence_seq_cst_() {
+; SM90-LABEL: fence_seq_cst_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_block() {
+; SM90-LABEL: fence_seq_cst_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_cluster() {
+; SM90-LABEL: fence_seq_cst_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_device() {
+; SM90-LABEL: fence_seq_cst_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") seq_cst
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
deleted file mode 100644
index e094ddf5775a63..00000000000000
--- a/llvm/test/CodeGen/NVPTX/fence.ll
+++ /dev/null
@@ -1,102 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM60
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
-; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
-
-; TODO: implement and test thread scope.
-
-; CHECK-LABEL: fence_sc_sys
-define void @fence_sc_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.sc.sys
- fence seq_cst
- ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_sys
-define void @fence_acq_rel_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
- fence acq_rel
- ret void
-}
-
-; CHECK-LABEL: fence_release_sys
-define void @fence_release_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
- fence release
- ret void
-}
-
-; CHECK-LABEL: fence_acquire_sys
-define void @fence_acquire_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
- fence acquire
- ret void
-}
-
-; CHECK-LABEL: fence_sc_gpu
-define void @fence_sc_gpu() local_unnamed_addr {
- ; SM60: membar.gl
- ; SM70: fence.sc.gpu
- fence syncscope("device") seq_cst
- ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_gpu
-define void @fence_acq_rel_gpu() local_unnamed_addr {
- ; SM60: membar.gl
- ; SM70: fence.acq_rel.gpu
- fence syncscope("device") acq_rel
- ret void
-}
-
-; CHECK-LABEL: fence_release_gpu
-define void @fence_release_gpu() local_unnamed_addr {
- ; SM60: membar.gl
- ; SM70: fence.acq_rel.gpu
- fence syncscope("device") release
- ret void
-}
-
-; CHECK-LABEL: fence_acquire_gpu
-define void @fence_acquire_gpu() local_unnamed_addr {
- ; SM60: membar.gl
- ; SM70: fence.acq_rel.gpu
- fence syncscope("device") acquire
- ret void
-}
-
-; CHECK-LABEL: fence_sc_cta
-define void @fence_sc_cta() local_unnamed_addr {
- ; SM60: membar.cta
- ; SM70: fence.sc.cta
- fence syncscope("block") seq_cst
- ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_cta
-define void @fence_acq_rel_cta() local_unnamed_addr {
- ; SM60: membar.cta
- ; SM70: fence.acq_rel.cta
- fence syncscope("block") acq_rel
- ret void
-}
-
-; CHECK-LABEL: fence_release_cta
-define void @fence_release_cta() local_unnamed_addr {
- ; SM60: membar.cta
- ; SM70: fence.acq_rel.cta
- fence syncscope("block") release
- ret void
-}
-
-; CHECK-LABEL: fence_acquire_cta
-define void @fence_acquire_cta() local_unnamed_addr {
- ; SM60: membar.cta
- ; SM70: fence.acq_rel.cta
- fence syncscope("block") acquire
- ret void
-}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/fence.py b/llvm/test/CodeGen/NVPTX/fence.py
new file mode 100644
index 00000000000000..529ef78db8089b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence.py
@@ -0,0 +1,38 @@
+# For manual usage, not as a part of lit tests. Used for generating the following tests:
+# fence-sm30.ll, fence-sm70.ll, fence-sm90.ll
+
+from string import Template
+from itertools import product
+
+fence_func = Template(
+"""
+define void @fence_${ordering}_${scope}() {
+ fence syncscope(\"${scope}\") ${ordering}
+ ret void
+}
+"""
+)
+
+run_statement = Template(
+"""
+; ${run}: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
+; ${run}: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify %}
+"""
+)
+
+# (sm, ptx)
+TESTS = [(30, 50), (70, 60), (90, 87)]
+
+SCOPES = ["", "block", "cluster", "device"]
+
+ORDERINGS = ["acquire", "release", "acq_rel", "seq_cst"]
+
+if __name__ == "__main__":
+ for sm, ptx in TESTS:
+ with open ("fence-sm{}.ll".format(sm), "w") as fp:
+ print(run_statement.substitute(run = "RUN", sm = sm, ptx = ptx), file = fp)
+ for ordering, scope in product(ORDERINGS, SCOPES):
+ if scope == "cluster" and (sm < 90 or ptx < 78):
+ print("; .cluster scope unsupported on SM = {} PTX = {}".format(sm, ptx), file = fp)
+ else:
+ print(fence_func.substitute(scope = scope, ordering = ordering), file = fp)
diff --git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg
index e3f06d1a720e3b..54a6c338bdf85a 100644
--- a/llvm/test/CodeGen/NVPTX/lit.local.cfg
+++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,3 +1,4 @@
if not "NVPTX" in config.root.targets:
config.unsupported = True
config.suffixes.add(".py")
+config.excludes = ["fence.py"]
More information about the llvm-commits
mailing list