[llvm] [NVPTX] Add family-specific architectures support (PR #141899)

Wed Jun 18 00:15:35 PDT 2025

https://github.com/rajatbajpai updated https://github.com/llvm/llvm-project/pull/141899

>From c4df43ec50f023e78a23bd6c79d848fce23e18b5 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Wed, 28 May 2025 16:48:44 +0530
Subject: [PATCH 1/5] [NVPTX] Add family-specific architectures support

This change adds support for family-specific architectures. These
architectures are identified by an "f" suffix, for example sm_100f.

This change does not promote any existing features to family-specific
architectures.
---
 llvm/lib/Target/NVPTX/NVPTX.td           | 68 ++++++++++++++++++++----
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  |  8 +--
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp |  2 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h   | 24 ++++++---
 llvm/test/CodeGen/NVPTX/sm-version.ll    | 20 +++++++
 5 files changed, 99 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index ff9a187ecf723..84d45cde189fc 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -33,20 +33,61 @@ class FeaturePTX<int version>:
    SubtargetFeature<"ptx"# version, "PTXVersion",
                     "" # version,
                     "Use PTX version " # version>;
-
+//
+// NVPTX Architecture Hierarchy and Ordering:
+// 
+// Family: 2/3/5/6/7/8/9/10/12 (Follows Onion model, older family is compatible with newer family)
+// Arch: 2*/3*/5*/6*/7*/8*/9*/10*/12*
+//
+// Family-specific: F*f : F*f > F* =>
+// + The plain base architecture is compatible with the family-specific architecture
+//   (e.g. sm_100 compatible with >= sm_100*f*)
+// + The family-specific architecture is compatible with future family-specific
+//   architectures within the same family (e.g. sm_100f compatible with >= sm_10X*f*
+//   but not with sm_12X*f*)
+//
+//    Family and SM Target Definition:
+//    +----------------+--------------------------------------------------------+
+//    | Family         | Target SM architectures included                       |
+//    +----------------+--------------------------------------------------------+
+//    | sm_10x family  | sm_100f, sm_103f, future targets in sm_10x family      |
+//    | sm_101 family  | sm_101f (exception)                                    |
+//    | sm_12x family  | sm_120f, sm_121f, future targets in sm_12x family      |
+//    +----------------+--------------------------------------------------------+
+//
+// Architecture-specific: F*a : F*a > F*f > F* =>
+// + The plain base architecture is compatible with the architecture-specific architecture
+//   (e.g. sm_100 compatible with >= sm_100*a*)
+// + The family-specific architecture is compatible with the architecture-specific architecture
+//   (e.g. sm_100f compatible with >= sm_100*a*)
+// + The architecture-specific architecture is incompatible with any other architecture
+//   (e.g. sm_100a is only compatible with sm_100*a*)
+//
+// Encoding: Arch * 1000 + 'f' * 10 + 'a' * 1 (where 'a' ⇒ 'f')
+// 
+// This encoding allows simple implementation of the partial ordering of the architectures.
+//  + Compare Family and Arch by dividing FullSMVersion by 1000 and 100 respectively before the comparison.
+//  + Compare within the family by comparing FullSMVersion, given both belongs to the same family.
+//  + Detect 'a' variants by checking FullSMVersion % 10.
+//
 foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
               60, 61, 62, 70, 72, 75, 80, 86, 87,
-              89, 90, 100, 101, 103, 120, 121] in
-  def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
+              89, 90, 100, 101, 103, 120, 121] in {
+  // Base SM version (e.g. FullSMVersion for sm_100 is 10000)
+  def SM#sm : FeatureSM<""#sm, !mul(sm, 100)>;
 
-// Arch-specific targets. PTX for these is not compatible with any other
-// architectures.
-def SM90a : FeatureSM<"90a", 901>;
-def SM100a: FeatureSM<"100a", 1001>;
-def SM101a: FeatureSM<"101a", 1011>;
-def SM103a: FeatureSM<"103a", 1031>;
-def SM120a: FeatureSM<"120a", 1201>;
-def SM121a: FeatureSM<"121a", 1211>;
+  // Family-specific targets which are compatible within same family
+  // (e.g. FullSMVersion for sm_100f is 10010)
+  if !ge(sm, 100) then {
+    def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 100), 10)>;
+  }
+
+  // Architecture-specific targets which are incompatible across architectures
+  // (e.g. FullSMVersion for sm_100a is 10011)
+  if !ge(sm, 90) then {
+    def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 100), 11)>;
+  }
+}
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
@@ -83,14 +124,19 @@ def : Proc<"sm_90",   [SM90, PTX78]>;
 def : Proc<"sm_90a",  [SM90a, PTX80]>;
 def : Proc<"sm_100",  [SM100, PTX86]>;
 def : Proc<"sm_100a", [SM100a, PTX86]>;
+def : Proc<"sm_100f", [SM100f, PTX88]>;
 def : Proc<"sm_101",  [SM101, PTX86]>;
 def : Proc<"sm_101a", [SM101a, PTX86]>;
+def : Proc<"sm_101f", [SM101f, PTX88]>;
 def : Proc<"sm_103",  [SM103, PTX88]>;
 def : Proc<"sm_103a", [SM103a, PTX88]>;
+def : Proc<"sm_103f", [SM103f, PTX88]>;
 def : Proc<"sm_120",  [SM120, PTX87]>;
 def : Proc<"sm_120a", [SM120a, PTX87]>;
+def : Proc<"sm_120f", [SM120f, PTX88]>;
 def : Proc<"sm_121",  [SM121, PTX88]>;
 def : Proc<"sm_121a", [SM121a, PTX88]>;
+def : Proc<"sm_121f", [SM121f, PTX88]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 5dbdce52f0553..bbe99dec5c445 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -158,10 +158,10 @@ class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>
 class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
 
 // Explicit records for arch-accelerated SM versions
-def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
-def hasSM100a : Predicate<"Subtarget->getFullSmVersion() == 1001">;
-def hasSM101a : Predicate<"Subtarget->getFullSmVersion() == 1011">;
-def hasSM120a : Predicate<"Subtarget->getFullSmVersion() == 1201">;
+def hasSM90a : Predicate<"Subtarget->getSmVersion() == 90 && Subtarget->hasArchAccelFeatures()">;
+def hasSM100a : Predicate<"Subtarget->getSmVersion() == 100 && Subtarget->hasArchAccelFeatures()">;
+def hasSM101a : Predicate<"Subtarget->getSmVersion() == 101 && Subtarget->hasArchAccelFeatures()">;
+def hasSM120a : Predicate<"Subtarget->getSmVersion() == 120 && Subtarget->hasArchAccelFeatures()">;
 
 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index e5d680c19d921..0b2b5dfe88e00 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -55,7 +55,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS,
                                const NVPTXTargetMachine &TM)
     : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
-      FullSmVersion(200), SmVersion(getSmVersion()),
+      FullSmVersion(2000), SmVersion(getSmVersion()),
       TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {
   TSInfo = std::make_unique<NVPTXSelectionDAGInfo>();
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index d2eae48826829..c48dc91595f2e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -108,8 +108,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
     switch (FullSmVersion) {
     default:
       break;
-    case 1001: // sm_100a
-    case 1011: // sm_101a
+    case 10011: // sm_100a
+    case 10111: // sm_101a
       HasTcgen05 = true;
       break;
     }
@@ -135,15 +135,25 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
   bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
-  unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
+  unsigned int getSmVersion() const { return getFullSmVersion() / 100; }
   // GPUs with "a" suffix have include architecture-accelerated features that
   // are supported on the specified architecture only, hence such targets do not
   // follow the onion layer model. hasArchAccelFeatures() allows
   // distinguishing such GPU variants from the base GPU architecture.
-  // - 0 represents base GPU model,
-  // - non-zero value identifies particular architecture-accelerated variant.
-  bool hasArchAccelFeatures() const { return getFullSmVersion() % 10; }
-
+  // - false represents non-accelerated architecture.
+  // - true represents architecture-accelerated variant.
+  bool hasArchAccelFeatures() const {
+    return getFullSmVersion() % 10 && PTXVersion >= 80;
+  }
+  // GPUs with 'f' suffix have architecture-accelerated features which are
+  // portable across all future architectures under same SM major. For example,
+  // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
+  // - false represents non-family-specific architecture.
+  // - true represents family-specific variant.
+  bool hasFamilySpecificFeatures() const {
+    return getFullSmVersion() % 100 == 10 ? PTXVersion >= 88
+                                          : hasArchAccelFeatures();
+  }
   // If the user did not provide a target we default to the `sm_30` target.
   std::string getTargetName() const {
     return TargetName.empty() ? "sm_30" : TargetName;
diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll b/llvm/test/CodeGen/NVPTX/sm-version.ll
index 9705a2f3ba730..3a154a1b9ac9c 100644
--- a/llvm/test/CodeGen/NVPTX/sm-version.ll
+++ b/llvm/test/CodeGen/NVPTX/sm-version.ll
@@ -18,14 +18,19 @@
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100 | FileCheck %s --check-prefix=SM100
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101 | FileCheck %s --check-prefix=SM101
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103 | FileCheck %s --check-prefix=SM103
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120 | FileCheck %s --check-prefix=SM120
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121 | FileCheck %s --check-prefix=SM121
 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a
+; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f
 
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_21 | FileCheck %s --check-prefix=SM21
@@ -47,14 +52,19 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=SM100
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101 | FileCheck %s --check-prefix=SM101
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103 | FileCheck %s --check-prefix=SM103
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120 | FileCheck %s --check-prefix=SM120
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121 | FileCheck %s --check-prefix=SM121
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f
 
 ; SM20: .version 3.2
 ; SM21: .version 3.2
@@ -76,14 +86,19 @@
 ; SM90a: .version 8.0
 ; SM100: .version 8.6
 ; SM100a: .version 8.6
+; SM100f: .version 8.8
 ; SM101: .version 8.6
 ; SM101a: .version 8.6
+; SM101f: .version 8.8
 ; SM103: .version 8.8
 ; SM103a: .version 8.8
+; SM103f: .version 8.8
 ; SM120: .version 8.7
 ; SM120a: .version 8.7
+; SM120f: .version 8.8
 ; SM121: .version 8.8
 ; SM121a: .version 8.8
+; SM121f: .version 8.8
 
 ; SM20: .target sm_20
 ; SM21: .target sm_21
@@ -105,11 +120,16 @@
 ; SM90a: .target sm_90a
 ; SM100: .target sm_100
 ; SM100a: .target sm_100a
+; SM100f: .target sm_100f
 ; SM101: .target sm_101
 ; SM101a: .target sm_101a
+; SM101f: .target sm_101f
 ; SM103: .target sm_103
 ; SM103a: .target sm_103a
+; SM103f: .target sm_103f
 ; SM120: .target sm_120
 ; SM120a: .target sm_120a
+; SM120f: .target sm_120f
 ; SM121: .target sm_121
 ; SM121a: .target sm_121a
+; SM121f: .target sm_121f

>From 4599b9fe558e35b0042a97d194ddff70c8c1b4a6 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Wed, 11 Jun 2025 15:01:15 +0530
Subject: [PATCH 2/5] Updated documentation based on the recent review comments

---
 llvm/lib/Target/NVPTX/NVPTX.td | 67 ++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 84d45cde189fc..802e9873a6e20 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -33,43 +33,48 @@ class FeaturePTX<int version>:
    SubtargetFeature<"ptx"# version, "PTXVersion",
                     "" # version,
                     "Use PTX version " # version>;
-//
 // NVPTX Architecture Hierarchy and Ordering:
-// 
-// Family: 2/3/5/6/7/8/9/10/12 (Follows Onion model, older family is compatible with newer family)
-// Arch: 2*/3*/5*/6*/7*/8*/9*/10*/12*
 //
-// Family-specific: F*f : F*f > F* =>
-// + The plain base architecture is compatible with the family-specific architecture
-//   (e.g. sm_100 compatible with >= sm_100*f*)
-// + The family-specific architecture is compatible with future family-specific
-//   architectures within the same family (e.g. sm_100f compatible with >= sm_10X*f*
-//   but not with sm_12X*f*)
+// GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y
+// ('Y' represents version within the architecture)
+// The architectures have name of form sm_XYz where 'X' represent the generation
+// number, 'Y' represents the version within the architecture, and 'z' represents
+// the optional feature suffix.
+// If X1Y1 <= X2Y2, then GPU capabilities of sm_X1Y1 are included in sm_X2Y2.
+// For example, take sm_90 (9 represents 'X', 0 represents 'Y', and no feature
+// suffix) and sm_103 architectures (10 represents 'X', 3 represents 'Y', and no
+// feature suffix). Since 90 <= 103, sm_90 is compatible with sm_103.
 //
-//    Family and SM Target Definition:
-//    +----------------+--------------------------------------------------------+
-//    | Family         | Target SM architectures included                       |
-//    +----------------+--------------------------------------------------------+
-//    | sm_10x family  | sm_100f, sm_103f, future targets in sm_10x family      |
-//    | sm_101 family  | sm_101f (exception)                                    |
-//    | sm_12x family  | sm_120f, sm_121f, future targets in sm_12x family      |
-//    +----------------+--------------------------------------------------------+
+// The family-specific architectures have 'f' feature suffix and they follow
+// following order:
+// sm_X{Y2}f > sm_X{Y1}f iff Y2 > Y1
+// sm_XY{f} > sm_{XY}{}
 //
-// Architecture-specific: F*a : F*a > F*f > F* =>
-// + The plain base architecture is compatible with the architecture-specific architecture
-//   (e.g. sm_100 compatible with >= sm_100*a*)
-// + The family-specific architecture is compatible with the architecture-specific architecture
-//   (e.g. sm_100f compatible with >= sm_100*a*)
-// + The architecture-specific architecture is incompatible with any other architecture
-//   (e.g. sm_100a is only compatible with sm_100*a*)
+// For example, take sm_100f (10 represents 'X', 0 represents 'Y', and 'f'
+// represents 'z') and sm_103f (10 represents 'X', 3 represents 'Y', and 'f'
+// represents 'z') architectures. Since Y1 < Y2, sm_100f is compatible with
+// sm_103f. Similarly based on the second rule, sm_90 is compatible with sm_103f.
 //
-// Encoding: Arch * 1000 + 'f' * 10 + 'a' * 1 (where 'a' ⇒ 'f')
-// 
-// This encoding allows simple implementation of the partial ordering of the architectures.
-//  + Compare Family and Arch by dividing FullSMVersion by 1000 and 100 respectively before the comparison.
-//  + Compare within the family by comparing FullSMVersion, given both belongs to the same family.
-//  + Detect 'a' variants by checking FullSMVersion % 10.
+// The architecture-specific architectures have 'a' feature suffix and they follow
+// following order:
+// sm_XY{a} > sm_XY{f} > sm_{XY}{}
+//
+// For example, take sm_103a (10 represents 'X', 3 represents 'Y', and 'a'
+// represents 'z'), sm_103f, and sm_103 architectures. The sm_103 is compatible
+// with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
 //
+// Encoding := Arch * 100 + 10 (for 'f') + 1 (for 'a')
+// Arch := X * 10 + Y
+//
+// For example, sm_103a is encoded as 10311 (103 * 100 + 10 + 1) and sm_103f is
+// encoded as 10310 (103 * 100 + 10).
+//
+// This encoding allows simple partial ordering of the architectures.
+//  + Compare Family and Arch by dividing FullSMVersion by 1000 and 100
+//    respectively before the comparison.
+//  + Compare within the family by comparing FullSMVersion, given both belongs to
+//    the same family.
+//  + Detect 'a' variants by checking FullSMVersion % 10.
 foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
               60, 61, 62, 70, 72, 75, 80, 86, 87,
               89, 90, 100, 101, 103, 120, 121] in {

>From f38ba43dff8e0fae0b407331758669b215840beb Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Wed, 11 Jun 2025 15:01:15 +0530
Subject: [PATCH 3/5] Updated documentation based on the recent review comments

---
 llvm/docs/NVPTXUsage.rst       | 52 +++++++++++++++++++++++++++++++++-
 llvm/lib/Target/NVPTX/NVPTX.td | 21 ++++++++------
 2 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index abd7ca5453645..e89ac9be54738 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -147,7 +147,57 @@ Example: 32-bit PTX for CUDA Driver API: ``nvptx-nvidia-cuda``
 
 Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda``
 
-
+.. _nvptx_arch_hierarchy:
+
+NVPTX Architecture Hierarchy and Ordering
+========================================
+
+GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y
+('Y' represents version within the architecture)
+The architectures have names of the form ``sm_XYz`` where ``X`` represents the generation
+number, ``Y`` represents the version within the architecture, and ``z`` represents
+the optional feature suffix.
+If ``X1Y1 <= X2Y2``, then GPU capabilities of ``sm_X1Y1`` are included in ``sm_X2Y2``.
+For example, take ``sm_90`` (9 represents ``X``, 0 represents ``Y``, and no feature
+suffix) and ``sm_103`` architectures (10 represents ``X``, 3 represents ``Y``, and no
+feature suffix). Since 90 <= 103, ``sm_90`` is compatible with ``sm_103``.
+
+The family-specific variants have ``f`` feature suffix and they follow
+following order:
+``sm_X{Y2}f > sm_X{Y1}f`` iff ``Y2 > Y1``
+``sm_XY{f} > sm_{XY}{}``
+
+For example, take ``sm_100f`` (10 represents ``X``, 0 represents ``Y``, and ``f``
+represents ``z``) and ``sm_103f`` (10 represents ``X``, 3 represents ``Y``, and ``f``
+represents ``z``) architecture variants. Since ``Y1 < Y2``, ``sm_100f`` is compatible with
+``sm_103f``. Similarly based on the second rule, ``sm_90`` is compatible with ``sm_103f``.
+
+As a counterexample, take ``sm_100f`` and ``sm_120f`` (12 represents ``X``, 0
+represents ``Y``, and ``f`` represents ``z``) architecture variants. Since they
+belong to different families, i.e. ``X1 != X2``, ``sm_100f`` is not compatible with
+``sm_120f``.
+
+The architecture-specific variants have ``a`` feature suffix and they follow
+following order:
+``sm_XY{a} > sm_XY{f} > sm_{XY}{}``
+
+For example, take ``sm_103a`` (10 represents ``X``, 3 represents ``Y``, and ``a``
+represents ``z``), ``sm_103f``, and ``sm_103`` architecture variants. The ``sm_103`` is
+compatible with ``sm_103a`` and ``sm_103f``, and ``sm_103f`` is compatible with ``sm_103a``.
+
+Encoding := Arch * 100 + 10 (for 'f') + 1 (for 'a')
+Arch := X * 10 + Y
+
+For example, ``sm_103a`` is encoded as 10311 (103 * 100 + 10 + 1) and ``sm_103f`` is
+encoded as 10310 (103 * 100 + 10).
+
+This encoding allows simple partial ordering of the architectures.
+
+* Compare Family and Arch by dividing FullSMVersion by 1000 and 100
+  respectively before the comparison.
+* Compare within the family by comparing FullSMVersion, given both belongs to
+  the same family.
+* Detect ``a`` variants by checking FullSMVersion % 10.
 
 .. _nvptx_intrinsics:
 
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 802e9873a6e20..4085cc4d648f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -45,23 +45,28 @@ class FeaturePTX<int version>:
 // suffix) and sm_103 architectures (10 represents 'X', 3 represents 'Y', and no
 // feature suffix). Since 90 <= 103, sm_90 is compatible with sm_103.
 //
-// The family-specific architectures have 'f' feature suffix and they follow
+// The family-specific variants have 'f' feature suffix and they follow
 // following order:
 // sm_X{Y2}f > sm_X{Y1}f iff Y2 > Y1
 // sm_XY{f} > sm_{XY}{}
 //
 // For example, take sm_100f (10 represents 'X', 0 represents 'Y', and 'f'
 // represents 'z') and sm_103f (10 represents 'X', 3 represents 'Y', and 'f'
-// represents 'z') architectures. Since Y1 < Y2, sm_100f is compatible with
+// represents 'z') architecture variants. Since Y1 < Y2, sm_100f is compatible with
 // sm_103f. Similarly based on the second rule, sm_90 is compatible with sm_103f.
 //
-// The architecture-specific architectures have 'a' feature suffix and they follow
+// Some counter examples, take sm_100f and sm_120f (12 represents 'X', 0
+// represents 'Y', and 'f' represents 'z') architecture variants. Since both
+// belongs to different family i.e. X1 != X2, sm_100f is not compatible with
+// sm_120f.
+//
+// The architecture-specific variants have 'a' feature suffix and they follow
 // following order:
 // sm_XY{a} > sm_XY{f} > sm_{XY}{}
 //
 // For example, take sm_103a (10 represents 'X', 3 represents 'Y', and 'a'
-// represents 'z'), sm_103f, and sm_103 architectures. The sm_103 is compatible
-// with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
+// represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is
+// compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
 //
 // Encoding := Arch * 100 + 10 (for 'f') + 1 (for 'a')
 // Arch := X * 10 + Y
@@ -83,15 +88,13 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
 
   // Family-specific targets which are compatible within same family
   // (e.g. FullSMVersion for sm_100f is 10010)
-  if !ge(sm, 100) then {
+  if !ge(sm, 100) then
     def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 100), 10)>;
-  }
 
   // Architecture-specific targets which are incompatible across architectures
   // (e.g. FullSMVersion for sm_100a is 10011)
-  if !ge(sm, 90) then {
+  if !ge(sm, 90) then
     def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 100), 11)>;
-  }
 }
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,

>From c3db5756724b0d2cef685fbcdffd445474d6fbf0 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Fri, 13 Jun 2025 18:44:15 +0530
Subject: [PATCH 4/5] Fix doc failure

---
 llvm/docs/NVPTXUsage.rst               | 2 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index e89ac9be54738..bf46cb0235054 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -150,7 +150,7 @@ Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda``
 .. _nvptx_arch_hierarchy:
 
 NVPTX Architecture Hierarchy and Ordering
-========================================
+=========================================
 
 GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y
 ('Y' represents version within the architecture)
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index c48dc91595f2e..b4a8a3389970d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -120,9 +120,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // TMA G2S copy with cta_group::1/2 support
   bool hasCpAsyncBulkTensorCTAGroupSupport() const {
     // TODO: Update/tidy-up after the family-conditional support arrives
-    return ((FullSmVersion == 1001 || FullSmVersion == 1011) &&
+    return ((FullSmVersion == 10011 || FullSmVersion == 10111) &&
             PTXVersion >= 86) ||
-           (FullSmVersion == 1031 && PTXVersion >= 88);
+           (FullSmVersion == 10311 && PTXVersion >= 88);
   }
 
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction

>From 444c46881b25dc4cec0dc47c8a26bc6c159378e7 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Wed, 18 Jun 2025 12:37:12 +0530
Subject: [PATCH 5/5] Update design to use separate bit instead of digit

---
 llvm/docs/NVPTXUsage.rst                 | 10 +++++-----
 llvm/lib/Target/NVPTX/NVPTX.td           | 22 +++++++++++-----------
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp |  2 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h   | 24 +++++++++++++++---------
 4 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index bf46cb0235054..1213ca684eee1 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -185,19 +185,19 @@ For example, take ``sm_103a`` (10 represents ``X``, 3 represents ``Y``, and ``a`
 represents ``z``), ``sm_103f``, and ``sm_103`` architecture variants. The ``sm_103`` is
 compatible with ``sm_103a`` and ``sm_103f``, and ``sm_103f`` is compatible with ``sm_103a``.
 
-Encoding := Arch * 100 + 10 (for 'f') + 1 (for 'a')
+Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a')
 Arch := X * 10 + Y
 
-For example, ``sm_103a`` is encoded as 10311 (103 * 100 + 10 + 1) and ``sm_103f`` is
-encoded as 10310 (103 * 100 + 10).
+For example, ``sm_103a`` is encoded as 1033 (103 * 10 + 2 + 1) and ``sm_103f`` is
+encoded as 1032 (103 * 10 + 2).
 
 This encoding allows simple partial ordering of the architectures.
 
-* Compare Family and Arch by dividing FullSMVersion by 1000 and 100
+* Compare Family and Arch by dividing FullSMVersion by 100 and 10
   respectively before the comparison.
 * Compare within the family by comparing FullSMVersion, given both belongs to
   the same family.
-* Detect ``a`` variants by checking FullSMVersion % 10.
+* Detect ``a`` variants by checking FullSMVersion & 1.
 
 .. _nvptx_intrinsics:
 
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 4085cc4d648f9..83992606bc419 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -68,33 +68,33 @@ class FeaturePTX<int version>:
 // represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is
 // compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
 //
-// Encoding := Arch * 100 + 10 (for 'f') + 1 (for 'a')
+// Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a')
 // Arch := X * 10 + Y
 //
-// For example, sm_103a is encoded as 10311 (103 * 100 + 10 + 1) and sm_103f is
-// encoded as 10310 (103 * 100 + 10).
+// For example, sm_103a is encoded as 1033 (103 * 10 + 2 + 1) and sm_103f is
+// encoded as 1032 (103 * 10 + 2).
 //
 // This encoding allows simple partial ordering of the architectures.
-//  + Compare Family and Arch by dividing FullSMVersion by 1000 and 100
+//  + Compare Family and Arch by dividing FullSMVersion by 100 and 10
 //    respectively before the comparison.
 //  + Compare within the family by comparing FullSMVersion, given both belongs to
 //    the same family.
-//  + Detect 'a' variants by checking FullSMVersion % 10.
+//  + Detect 'a' variants by checking FullSMVersion & 1.
 foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
               60, 61, 62, 70, 72, 75, 80, 86, 87,
               89, 90, 100, 101, 103, 120, 121] in {
-  // Base SM version (e.g. FullSMVersion for sm_100 is 10000)
-  def SM#sm : FeatureSM<""#sm, !mul(sm, 100)>;
+  // Base SM version (e.g. FullSMVersion for sm_100 is 1000)
+  def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>;
 
   // Family-specific targets which are compatible within same family
-  // (e.g. FullSMVersion for sm_100f is 10010)
+  // (e.g. FullSMVersion for sm_100f is 1002)
   if !ge(sm, 100) then
-    def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 100), 10)>;
+    def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
 
   // Architecture-specific targets which are incompatible across architectures
-  // (e.g. FullSMVersion for sm_100a is 10011)
+  // (e.g. FullSMVersion for sm_100a is 1003)
   if !ge(sm, 90) then
-    def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 100), 11)>;
+    def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>;
 }
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 0b2b5dfe88e00..e5d680c19d921 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -55,7 +55,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS,
                                const NVPTXTargetMachine &TM)
     : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
-      FullSmVersion(2000), SmVersion(getSmVersion()),
+      FullSmVersion(200), SmVersion(getSmVersion()),
       TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {
   TSInfo = std::make_unique<NVPTXSelectionDAGInfo>();
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index b4a8a3389970d..f8aff9eaf2b9d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -108,8 +108,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
     switch (FullSmVersion) {
     default:
       break;
-    case 10011: // sm_100a
-    case 10111: // sm_101a
+    case 1003: // sm_100a
+    case 1013: // sm_101a
       HasTcgen05 = true;
       break;
     }
@@ -120,9 +120,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // TMA G2S copy with cta_group::1/2 support
   bool hasCpAsyncBulkTensorCTAGroupSupport() const {
     // TODO: Update/tidy-up after the family-conditional support arrives
-    return ((FullSmVersion == 10011 || FullSmVersion == 10111) &&
-            PTXVersion >= 86) ||
-           (FullSmVersion == 10311 && PTXVersion >= 88);
+    switch (FullSmVersion) {
+    case 1003:
+    case 1013:
+      return PTXVersion >= 86;
+    case 1033:
+      return PTXVersion >= 88;
+    default:
+      return false;
+    }
   }
 
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
@@ -135,7 +141,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
   bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
-  unsigned int getSmVersion() const { return getFullSmVersion() / 100; }
+  unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
   // GPUs with "a" suffix have include architecture-accelerated features that
   // are supported on the specified architecture only, hence such targets do not
   // follow the onion layer model. hasArchAccelFeatures() allows
@@ -143,7 +149,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // - false represents non-accelerated architecture.
   // - true represents architecture-accelerated variant.
   bool hasArchAccelFeatures() const {
-    return getFullSmVersion() % 10 && PTXVersion >= 80;
+    return (getFullSmVersion() & 1) && PTXVersion >= 80;
   }
   // GPUs with 'f' suffix have architecture-accelerated features which are
   // portable across all future architectures under same SM major. For example,
@@ -151,8 +157,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // - false represents non-family-specific architecture.
   // - true represents family-specific variant.
   bool hasFamilySpecificFeatures() const {
-    return getFullSmVersion() % 100 == 10 ? PTXVersion >= 88
-                                          : hasArchAccelFeatures();
+    return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
+                                        : hasArchAccelFeatures();
   }
   // If the user did not provide a target we default to the `sm_30` target.
   std::string getTargetName() const {