[llvm] 9ccf038 - [NVPTX] Support for fence.acquire and fence.release (#124865)

via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 4 14:20:04 PST 2025


Author: Akshay Deodhar
Date: 2025-02-04T14:20:00-08:00
New Revision: 9ccf03861550d3bfceb828f1d1ae2210cf1eda5a

URL: https://github.com/llvm/llvm-project/commit/9ccf03861550d3bfceb828f1d1ae2210cf1eda5a
DIFF: https://github.com/llvm/llvm-project/commit/9ccf03861550d3bfceb828f1d1ae2210cf1eda5a.diff

LOG: [NVPTX] Support for fence.acquire and fence.release (#124865)

Adds codegen support for fence.acquire and fence.release, a script and
generated tests for all possible legal fences, and cleans up some
tablegen rules.

Added: 
    llvm/test/CodeGen/NVPTX/fence-cluster.ll
    llvm/test/CodeGen/NVPTX/fence-nocluster.ll
    llvm/test/CodeGen/NVPTX/fence.py

Modified: 
    llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
    llvm/lib/Target/NVPTX/NVPTXSubtarget.h
    llvm/test/CodeGen/NVPTX/lit.local.cfg

Removed: 
    llvm/test/CodeGen/NVPTX/fence-sm-90.ll
    llvm/test/CodeGen/NVPTX/fence.ll


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ac8ce05724750cb..ec654e0f3f200f4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -648,9 +648,50 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
   if (S == NVPTX::Scope::Cluster)
     T->failIfClustersUnsupported(".cluster scope fence");
 
+  // Fall back to .acq_rel if .acquire, .release is not supported.
+  if (!T->hasSplitAcquireAndReleaseFences() &&
+      (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
+    O = NVPTX::Ordering::AcquireRelease;
+
   switch (O) {
   case NVPTX::Ordering::Acquire:
+    switch (S) {
+    case NVPTX::Scope::System:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
+                                    : NVPTX::INT_MEMBAR_SYS;
+    case NVPTX::Scope::Block:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
+                                    : NVPTX::INT_MEMBAR_CTA;
+    case NVPTX::Scope::Cluster:
+      return NVPTX::atomic_thread_fence_acquire_cluster;
+    case NVPTX::Scope::Device:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
+                                    : NVPTX::INT_MEMBAR_GL;
+    case NVPTX::Scope::Thread:
+      report_fatal_error(
+          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+                  ScopeToString(S)));
+    }
+    break;
   case NVPTX::Ordering::Release:
+    switch (S) {
+    case NVPTX::Scope::System:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
+                                    : NVPTX::INT_MEMBAR_SYS;
+    case NVPTX::Scope::Block:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
+                                    : NVPTX::INT_MEMBAR_CTA;
+    case NVPTX::Scope::Cluster:
+      return NVPTX::atomic_thread_fence_release_cluster;
+    case NVPTX::Scope::Device:
+      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
+                                    : NVPTX::INT_MEMBAR_GL;
+    case NVPTX::Scope::Thread:
+      report_fatal_error(
+          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+                  ScopeToString(S)));
+    }
+    break;
   case NVPTX::Ordering::AcquireRelease: {
     switch (S) {
     case NVPTX::Scope::System:

diff  --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 74def43d825665b..f94d549e24456cd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3867,33 +3867,16 @@ def : Pat <
 // PTX Fence instructions
 ////////////////////////////////////////////////////////////////////////////////
 
-def atomic_thread_fence_seq_cst_sys :
-  NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
-  Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_sys :
-  NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
-  Requires<[hasPTX<60>, hasSM<70>]>;
-
-def atomic_thread_fence_seq_cst_gpu :
-  NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
-  Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_gpu :
-  NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
-  Requires<[hasPTX<60>, hasSM<70>]>;
-
-def atomic_thread_fence_seq_cst_cluster :
-  NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
-  Requires<[hasPTX<78>, hasSM<90>]>;
-def atomic_thread_fence_acq_rel_cluster :
-  NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
-  Requires<[hasPTX<78>, hasSM<90>]>;
-
-def atomic_thread_fence_seq_cst_cta :
-  NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
-  Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_cta :
-  NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
-  Requires<[hasPTX<60>, hasSM<70>]>;
+class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
+    NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
+    Requires<[ptx, hasSM<70>]>;
+
+foreach scope = ["sys", "gpu", "cluster", "cta"] in {
+  def atomic_thread_fence_seq_cst_#scope: NVPTXFenceInst<scope, "sc", hasPTX<60>>;
+  def atomic_thread_fence_acq_rel_#scope: NVPTXFenceInst<scope, "acq_rel", hasPTX<60>>;
+  def atomic_thread_fence_acquire_#scope: NVPTXFenceInst<scope, "acquire", hasPTX<87>>;
+  def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<87>>;
+}
 
 def fpimm_any_zero : FPImmLeaf<fAny, [{
   return Imm.isZero();

diff  --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 0c4420b085dc9a0..851c9152e4cb8ff 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -88,6 +88,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
   // release, acq_rel, sc) ?
   bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
+  // Does SM & PTX support .acquire and .release qualifiers for fence?
+  bool hasSplitAcquireAndReleaseFences() const {
+    return SmVersion >= 90 && PTXVersion >= 86;
+  }
   // Does SM & PTX support atomic relaxed MMIO operations ?
   bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
   bool hasDotInstructions() const {

diff  --git a/llvm/test/CodeGen/NVPTX/fence-cluster.ll b/llvm/test/CodeGen/NVPTX/fence-cluster.ll
new file mode 100644
index 000000000000000..697dce4f89515ab
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-cluster.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify %}
+
+define void @fence_acquire_cluster() {
+; SM90-LABEL: fence_acquire_cluster(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acquire.cluster;
+; SM90-NEXT:    ret;
+    fence syncscope("cluster") acquire
+    ret void
+}
+
+
+define void @fence_release_cluster() {
+; SM90-LABEL: fence_release_cluster(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.release.cluster;
+; SM90-NEXT:    ret;
+    fence syncscope("cluster") release
+    ret void
+}
+
+
+define void @fence_acq_rel_cluster() {
+; SM90-LABEL: fence_acq_rel_cluster(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acq_rel.cluster;
+; SM90-NEXT:    ret;
+    fence syncscope("cluster") acq_rel
+    ret void
+}
+
+
+define void @fence_seq_cst_cluster() {
+; SM90-LABEL: fence_seq_cst_cluster(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.sc.cluster;
+; SM90-NEXT:    ret;
+    fence syncscope("cluster") seq_cst
+    ret void
+}
+

diff  --git a/llvm/test/CodeGen/NVPTX/fence-nocluster.ll b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll
new file mode 100644
index 000000000000000..e2bec72517d550e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-nocluster.ll
@@ -0,0 +1,355 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx50 | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx50 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify %}
+
+define void @fence_acquire_sys() {
+; SM30-LABEL: fence_acquire_sys(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_acquire_sys(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_acquire_sys(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    ret;
+    fence syncscope("") acquire
+    ret void
+}
+
+
+define void @fence_acquire_cta() {
+; SM30-LABEL: fence_acquire_cta(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.cta;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_acquire_cta(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_acquire_cta(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acquire.cta;
+; SM90-NEXT:    ret;
+    fence syncscope("block") acquire
+    ret void
+}
+
+
+define void @fence_acquire_gpu() {
+; SM30-LABEL: fence_acquire_gpu(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.gl;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_acquire_gpu(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_acquire_gpu(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acquire.gpu;
+; SM90-NEXT:    ret;
+    fence syncscope("device") acquire
+    ret void
+}
+
+
+define void @fence_release_sys() {
+; SM30-LABEL: fence_release_sys(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_release_sys(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_release_sys(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ret;
+    fence syncscope("") release
+    ret void
+}
+
+
+define void @fence_release_cta() {
+; SM30-LABEL: fence_release_cta(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.cta;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_release_cta(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_release_cta(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.release.cta;
+; SM90-NEXT:    ret;
+    fence syncscope("block") release
+    ret void
+}
+
+
+define void @fence_release_gpu() {
+; SM30-LABEL: fence_release_gpu(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.gl;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_release_gpu(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_release_gpu(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.release.gpu;
+; SM90-NEXT:    ret;
+    fence syncscope("device") release
+    ret void
+}
+
+
+define void @fence_acq_rel_sys() {
+; SM30-LABEL: fence_acq_rel_sys(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_acq_rel_sys(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_acq_rel_sys(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acq_rel.sys;
+; SM90-NEXT:    ret;
+    fence syncscope("") acq_rel
+    ret void
+}
+
+
+define void @fence_acq_rel_cta() {
+; SM30-LABEL: fence_acq_rel_cta(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.cta;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_acq_rel_cta(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_acq_rel_cta(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acq_rel.cta;
+; SM90-NEXT:    ret;
+    fence syncscope("block") acq_rel
+    ret void
+}
+
+
+define void @fence_acq_rel_gpu() {
+; SM30-LABEL: fence_acq_rel_gpu(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.gl;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_acq_rel_gpu(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.acq_rel.gpu;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_acq_rel_gpu(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.acq_rel.gpu;
+; SM90-NEXT:    ret;
+    fence syncscope("device") acq_rel
+    ret void
+}
+
+
+define void @fence_seq_cst_sys() {
+; SM30-LABEL: fence_seq_cst_sys(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_seq_cst_sys(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_seq_cst_sys(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ret;
+    fence syncscope("") seq_cst
+    ret void
+}
+
+
+define void @fence_seq_cst_cta() {
+; SM30-LABEL: fence_seq_cst_cta(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.cta;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_seq_cst_cta(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.sc.cta;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_seq_cst_cta(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.sc.cta;
+; SM90-NEXT:    ret;
+    fence syncscope("block") seq_cst
+    ret void
+}
+
+
+define void @fence_seq_cst_gpu() {
+; SM30-LABEL: fence_seq_cst_gpu(
+; SM30:       {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    membar.gl;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: fence_seq_cst_gpu(
+; SM70:       {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    fence.sc.gpu;
+; SM70-NEXT:    ret;
+;
+; SM90-LABEL: fence_seq_cst_gpu(
+; SM90:       {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    fence.sc.gpu;
+; SM90-NEXT:    ret;
+    fence syncscope("device") seq_cst
+    ret void
+}
+

diff  --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
deleted file mode 100644
index dce39bf3e1e3ed7..000000000000000
--- a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
-; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-
-; CHECK-LABEL: fence_sc_cluster
-define void @fence_sc_cluster() local_unnamed_addr {
-  ; CHECK: fence.sc.cluster
-  fence syncscope("cluster") seq_cst
-  ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_cluster
-define void @fence_acq_rel_cluster() local_unnamed_addr {
-  ; CHECK: fence.acq_rel.cluster
-  fence syncscope("cluster") acq_rel
-  ret void
-}
-
-; CHECK-LABEL: fence_release_cluster
-define void @fence_release_cluster() local_unnamed_addr {
-  ; CHECK: fence.acq_rel.cluster
-  fence syncscope("cluster") release
-  ret void
-}
-
-; CHECK-LABEL: fence_acquire_cluster
-define void @fence_acquire_cluster() local_unnamed_addr {
-  ; CHECK: fence.acq_rel.cluster
-  fence syncscope("cluster") acquire
-  ret void
-}

diff  --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
deleted file mode 100644
index e094ddf5775a639..000000000000000
--- a/llvm/test/CodeGen/NVPTX/fence.ll
+++ /dev/null
@@ -1,102 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM60
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
-; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
-
-; TODO: implement and test thread scope.
-
-; CHECK-LABEL: fence_sc_sys
-define void @fence_sc_sys() local_unnamed_addr {
-  ; SM60: membar.sys
-  ; SM70: fence.sc.sys
-  fence seq_cst
-  ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_sys
-define void @fence_acq_rel_sys() local_unnamed_addr {
-  ; SM60: membar.sys
-  ; SM70: fence.acq_rel.sys
-  fence acq_rel
-  ret void
-}
-
-; CHECK-LABEL: fence_release_sys
-define void @fence_release_sys() local_unnamed_addr {
-  ; SM60: membar.sys
-  ; SM70: fence.acq_rel.sys
-  fence release
-  ret void
-}
-
-; CHECK-LABEL: fence_acquire_sys
-define void @fence_acquire_sys() local_unnamed_addr {
-  ; SM60: membar.sys
-  ; SM70: fence.acq_rel.sys
-  fence acquire
-  ret void
-}
-
-; CHECK-LABEL: fence_sc_gpu
-define void @fence_sc_gpu() local_unnamed_addr {
-  ; SM60: membar.gl
-  ; SM70: fence.sc.gpu
-  fence syncscope("device") seq_cst
-  ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_gpu
-define void @fence_acq_rel_gpu() local_unnamed_addr {
-  ; SM60: membar.gl
-  ; SM70: fence.acq_rel.gpu
-  fence syncscope("device") acq_rel
-  ret void
-}
-
-; CHECK-LABEL: fence_release_gpu
-define void @fence_release_gpu() local_unnamed_addr {
-  ; SM60: membar.gl
-  ; SM70: fence.acq_rel.gpu
-  fence syncscope("device") release
-  ret void
-}
-
-; CHECK-LABEL: fence_acquire_gpu
-define void @fence_acquire_gpu() local_unnamed_addr {
-  ; SM60: membar.gl
-  ; SM70: fence.acq_rel.gpu
-  fence syncscope("device") acquire
-  ret void
-}
-
-; CHECK-LABEL: fence_sc_cta
-define void @fence_sc_cta() local_unnamed_addr {
-  ; SM60: membar.cta
-  ; SM70: fence.sc.cta
-  fence syncscope("block") seq_cst
-  ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_cta
-define void @fence_acq_rel_cta() local_unnamed_addr {
-  ; SM60: membar.cta
-  ; SM70: fence.acq_rel.cta
-  fence syncscope("block") acq_rel
-  ret void
-}
-
-; CHECK-LABEL: fence_release_cta
-define void @fence_release_cta() local_unnamed_addr {
-  ; SM60: membar.cta
-  ; SM70: fence.acq_rel.cta
-  fence syncscope("block") release
-  ret void
-}
-
-; CHECK-LABEL: fence_acquire_cta
-define void @fence_acquire_cta() local_unnamed_addr {
-  ; SM60: membar.cta
-  ; SM70: fence.acq_rel.cta
-  fence syncscope("block") acquire
-  ret void
-}
\ No newline at end of file

diff  --git a/llvm/test/CodeGen/NVPTX/fence.py b/llvm/test/CodeGen/NVPTX/fence.py
new file mode 100644
index 000000000000000..b9f9d294e6fe866
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence.py
@@ -0,0 +1,56 @@
+# For manual use only; not run as part of the lit tests. Generates the following tests:
+# fence-nocluster.ll, fence-cluster.ll
+
+from string import Template
+from itertools import product
+
+fence_func = Template(
+    """
+define void @fence_${ordering}_${ptx_scope}() {
+    fence syncscope(\"${llvm_scope}\") ${ordering}
+    ret void
+}
+"""
+)
+
+run_statement = Template(
+    """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify %}"""
+)
+
+# (sm, ptx)
+TESTS = [(30, 50), (70, 60), (90, 87)]
+
+LLVM_SCOPES_NO_CLUSTER = ["", "block", "device"]
+
+SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"}
+
+ORDERINGS = ["acquire", "release", "acq_rel", "seq_cst"]
+
+if __name__ == "__main__":
+    # fences at non-cluster scopes are supported on SM30, SM70 and SM90
+    with open("fence-nocluster.ll", "w") as fp:
+        for sm, ptx in TESTS:
+            print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
+        for ordering, llvm_scope in product(ORDERINGS, LLVM_SCOPES_NO_CLUSTER):
+            print(
+                fence_func.substitute(
+                    llvm_scope=llvm_scope,
+                    ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+                    ordering=ordering,
+                ),
+                file=fp,
+            )
+
+    # cluster-scope fences are only supported on SM90
+    with open("fence-cluster.ll", "w") as fp:
+        print(run_statement.substitute(sm=90, ptx=87), file=fp)
+        for ordering in ORDERINGS:
+            print(
+                fence_func.substitute(
+                    llvm_scope="cluster",
+                    ptx_scope=SCOPE_LLVM_TO_PTX["cluster"],
+                    ordering=ordering,
+                ),
+                file=fp,
+            )

diff  --git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg
index e3f06d1a720e3b2..54a6c338bdf85af 100644
--- a/llvm/test/CodeGen/NVPTX/lit.local.cfg
+++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,3 +1,4 @@
 if not "NVPTX" in config.root.targets:
     config.unsupported = True
 config.suffixes.add(".py")
+config.excludes = ["fence.py"]


        


More information about the llvm-commits mailing list