[llvm] [NVPTX] Add family-specific architectures support (PR #141899)

Thu May 29 04:49:57 PDT 2025

https://github.com/rajatbajpai updated https://github.com/llvm/llvm-project/pull/141899

>From 25e69b66efb4608cc9ab3df62a2235e458d3afa8 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Wed, 28 May 2025 16:48:44 +0530
Subject: [PATCH 1/2] [NVPTX] Add family-specific architectures support

This change adds family-specific architectures support. These
architectures have "f" suffix. For example, sm_100f.

This change doesn't promote existing features to family-specific
architecture.
---
 llvm/lib/Target/NVPTX/NVPTX.td         | 19 ++++++++++++------
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 27 ++++++++++++++++++++++----
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index ff9a187ecf723..3ed2553fa4232 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -41,12 +41,14 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
 
 // Arch-specific targets. PTX for these is not compatible with any other
 // architectures.
-def SM90a : FeatureSM<"90a", 901>;
-def SM100a: FeatureSM<"100a", 1001>;
-def SM101a: FeatureSM<"101a", 1011>;
-def SM103a: FeatureSM<"103a", 1031>;
-def SM120a: FeatureSM<"120a", 1201>;
-def SM121a: FeatureSM<"121a", 1211>;
+foreach sm = [90, 100, 101, 103, 120, 121] in {
+  def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 1)>;
+}
+
+// Family-specific targets. PTX for these is compatible within the same family.
+foreach sm = [100, 101, 103, 120, 121] in {
+  def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
+}
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
@@ -83,14 +85,19 @@ def : Proc<"sm_90",   [SM90, PTX78]>;
 def : Proc<"sm_90a",  [SM90a, PTX80]>;
 def : Proc<"sm_100",  [SM100, PTX86]>;
 def : Proc<"sm_100a", [SM100a, PTX86]>;
+def : Proc<"sm_100f", [SM100f, PTX88]>;
 def : Proc<"sm_101",  [SM101, PTX86]>;
 def : Proc<"sm_101a", [SM101a, PTX86]>;
+def : Proc<"sm_101f", [SM101f, PTX88]>;
 def : Proc<"sm_103",  [SM103, PTX88]>;
 def : Proc<"sm_103a", [SM103a, PTX88]>;
+def : Proc<"sm_103f", [SM103f, PTX88]>;
 def : Proc<"sm_120",  [SM120, PTX87]>;
 def : Proc<"sm_120a", [SM120a, PTX87]>;
+def : Proc<"sm_120f", [SM120f, PTX88]>;
 def : Proc<"sm_121",  [SM121, PTX88]>;
 def : Proc<"sm_121a", [SM121a, PTX88]>;
+def : Proc<"sm_121f", [SM121f, PTX88]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 5136b1ee28502..5e4ab9476cb31 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -132,10 +132,29 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // are supported on the specified architecture only, hence such targets do not
   // follow the onion layer model. hasArchAccelFeatures() allows
   // distinguishing such GPU variants from the base GPU architecture.
-  // - 0 represents base GPU model,
-  // - non-zero value identifies particular architecture-accelerated variant.
-  bool hasArchAccelFeatures() const { return getFullSmVersion() % 10; }
-
+  // - false represents non-accelerated architecture.
+  // - true represents architecture-accelerated variant.
+  bool hasArchAccelFeatures() const {
+    auto FullSMVersionMod = getFullSmVersion() % 10;
+    assert(FullSMVersionMod < 3 && "Invalid architecture!");
+    return FullSMVersionMod == 1;
+  }
+  // GPUs with 'f' suffix have architecture-accelerated features which are
+  // portable across all future architectures under same SM major. For example,
+  // sm_100f features will work for sm_10X future architectures.
+  // - false represents non-family-specific architecture.
+  // - true represents family-specific variant.
+  bool hasFamilySpecificFeatures() const {
+    auto FullSMVersionMod = getFullSmVersion() % 10;
+    assert(FullSMVersionMod < 3 && "Invalid architecture!");
+    return FullSMVersionMod == 2 && PTXVersion >= 88;
+  }
+  // Checks if architecture is accelerated or family-specific.
+  // - false represents neither arch-accelerated nor family-specific arch.
+  // - true represents either arch-accelerated or family-specific arch.
+  bool hasArchAccelOrFamilySpecificFeatures() const {
+    return hasArchAccelFeatures() || hasFamilySpecificFeatures();
+  }
   // If the user did not provide a target we default to the `sm_30` target.
   std::string getTargetName() const {
     return TargetName.empty() ? "sm_30" : TargetName;

>From 203c577aa185fb2de43590826a9373d421812f16 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Thu, 29 May 2025 17:18:48 +0530
Subject: [PATCH 2/2] Addressed review comments.

---
 llvm/lib/Target/NVPTX/NVPTX.td        | 17 +++++++++--------
 llvm/test/CodeGen/NVPTX/sm-version.ll | 20 ++++++++++++++++++++
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 3ed2553fa4232..4d5a36e84a2d7 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -36,18 +36,19 @@ class FeaturePTX<int version>:
 
 foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
               60, 61, 62, 70, 72, 75, 80, 86, 87,
-              89, 90, 100, 101, 103, 120, 121] in
+              89, 90] in
   def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
 
-// Arch-specific targets. PTX for these is not compatible with any other
-// architectures.
-foreach sm = [90, 100, 101, 103, 120, 121] in {
-  def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 1)>;
-}
+// Full SM version for sm_90a is 901
+def SM90a: FeatureSM<"90a", 901>;
 
-// Family-specific targets. PTX for these is compatible within the same family.
 foreach sm = [100, 101, 103, 120, 121] in {
-  def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
+  def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
+  // Arch-specific targets. PTX for these is not compatible with any other
+  // architectures.
+  def SM#sm#a: FeatureSM<""#sm#"a", !add(!mul(sm, 10), 1)>;
+  // Family-specific targets. PTX for these is compatible within the same family.
+  def SM#sm#f: FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
 }
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll b/llvm/test/CodeGen/NVPTX/sm-version.ll
index 9705a2f3ba730..3a154a1b9ac9c 100644
--- a/llvm/test/CodeGen/NVPTX/sm-version.ll
+++ b/llvm/test/CodeGen/NVPTX/sm-version.ll
@@ -18,14 +18,19 @@
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100 | FileCheck %s --check-prefix=SM100
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101 | FileCheck %s --check-prefix=SM101
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103 | FileCheck %s --check-prefix=SM103
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120 | FileCheck %s --check-prefix=SM120
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121 | FileCheck %s --check-prefix=SM121
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f
 
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_21 | FileCheck %s --check-prefix=SM21
@@ -47,14 +52,19 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=SM100
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101 | FileCheck %s --check-prefix=SM101
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103 | FileCheck %s --check-prefix=SM103
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120 | FileCheck %s --check-prefix=SM120
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121 | FileCheck %s --check-prefix=SM121
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f
 
 ; SM20: .version 3.2
 ; SM21: .version 3.2
@@ -76,14 +86,19 @@
 ; SM90a: .version 8.0
 ; SM100: .version 8.6
 ; SM100a: .version 8.6
+; SM100f: .version 8.8
 ; SM101: .version 8.6
 ; SM101a: .version 8.6
+; SM101f: .version 8.8
 ; SM103: .version 8.8
 ; SM103a: .version 8.8
+; SM103f: .version 8.8
 ; SM120: .version 8.7
 ; SM120a: .version 8.7
+; SM120f: .version 8.8
 ; SM121: .version 8.8
 ; SM121a: .version 8.8
+; SM121f: .version 8.8
 
 ; SM20: .target sm_20
 ; SM21: .target sm_21
@@ -105,11 +120,16 @@
 ; SM90a: .target sm_90a
 ; SM100: .target sm_100
 ; SM100a: .target sm_100a
+; SM100f: .target sm_100f
 ; SM101: .target sm_101
 ; SM101a: .target sm_101a
+; SM101f: .target sm_101f
 ; SM103: .target sm_103
 ; SM103a: .target sm_103a
+; SM103f: .target sm_103f
 ; SM120: .target sm_120
 ; SM120a: .target sm_120a
+; SM120f: .target sm_120f
 ; SM121: .target sm_121
 ; SM121a: .target sm_121a
+; SM121f: .target sm_121f