[llvm] [BinaryFormat][ELF][CUDA] Extended binary information for CUDA (PR #133076)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 26 06:17:31 PDT 2025


https://github.com/D-CUDA created https://github.com/llvm/llvm-project/pull/133076

Extended binary information for CUDA.
Provided information for CUDA-specific relocations.
Switched to decimal enum values for CUDA_SM* to make them more intuitive.

>From 26c0c9ef8bc63ccf1ddacfdd59536e227dd979ff Mon Sep 17 00:00:00 2001
From: D-CUDA <dengers09 at mail.ru>
Date: Wed, 26 Mar 2025 16:01:38 +0300
Subject: [PATCH] [BinaryFormat][ELF][CUDA] Extended binary information for
 CUDA, provided info for cuda specific relocations

---
 llvm/include/llvm/BinaryFormat/ELF.h          |  58 +++++---
 .../llvm/BinaryFormat/ELFRelocs/CUDA.def      | 130 ++++++++++++++++++
 llvm/lib/MC/MCSymbolELF.cpp                   |  24 ++--
 llvm/lib/Object/ELF.cpp                       |   7 +
 4 files changed, 185 insertions(+), 34 deletions(-)
 create mode 100644 llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def

diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 37eab89e706db..43acdbdb46424 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -907,38 +907,47 @@ enum {
 #include "ELFRelocs/AMDGPU.def"
 };
 
-// NVPTX specific e_flags.
+// CUDA specific st_other
+enum : unsigned {
+  STO_CUDA_ENTRY    = 0x10,  // entry function
+  STO_CUDA_GLOBAL   = 0x20,
+  STO_CUDA_SHARED   = 0x40,
+  STO_CUDA_LOCAL    = 0x60,
+  STO_CUDA_CONSTANT = 0x80
+};
+
+// CUDA specific e_flags.
 enum : unsigned {
   // Processor selection mask for EF_CUDA_SM* values.
   EF_CUDA_SM = 0xff,
 
   // SM based processor values.
-  EF_CUDA_SM20 = 0x14,
-  EF_CUDA_SM21 = 0x15,
-  EF_CUDA_SM30 = 0x1e,
-  EF_CUDA_SM32 = 0x20,
-  EF_CUDA_SM35 = 0x23,
-  EF_CUDA_SM37 = 0x25,
-  EF_CUDA_SM50 = 0x32,
-  EF_CUDA_SM52 = 0x34,
-  EF_CUDA_SM53 = 0x35,
-  EF_CUDA_SM60 = 0x3c,
-  EF_CUDA_SM61 = 0x3d,
-  EF_CUDA_SM62 = 0x3e,
-  EF_CUDA_SM70 = 0x46,
-  EF_CUDA_SM72 = 0x48,
-  EF_CUDA_SM75 = 0x4b,
-  EF_CUDA_SM80 = 0x50,
-  EF_CUDA_SM86 = 0x56,
-  EF_CUDA_SM87 = 0x57,
-  EF_CUDA_SM89 = 0x59,
+  EF_CUDA_SM20 = 20,
+  EF_CUDA_SM21 = 21,
+  EF_CUDA_SM30 = 30,
+  EF_CUDA_SM32 = 32,
+  EF_CUDA_SM35 = 35,
+  EF_CUDA_SM37 = 37,
+  EF_CUDA_SM50 = 50,
+  EF_CUDA_SM52 = 52,
+  EF_CUDA_SM53 = 53,
+  EF_CUDA_SM60 = 60,
+  EF_CUDA_SM61 = 61,
+  EF_CUDA_SM62 = 62,
+  EF_CUDA_SM70 = 70,
+  EF_CUDA_SM72 = 72,
+  EF_CUDA_SM75 = 75,
+  EF_CUDA_SM80 = 80,
+  EF_CUDA_SM86 = 86,
+  EF_CUDA_SM87 = 87,
+  EF_CUDA_SM89 = 89,
   // The sm_90a variant uses the same machine flag.
-  EF_CUDA_SM90 = 0x5a,
+  EF_CUDA_SM90 = 90,
 
   // Unified texture binding is enabled.
   EF_CUDA_TEXMODE_UNIFIED = 0x100,
   // Independent texture binding is enabled.
-  EF_CUDA_TEXMODE_INDEPENDANT = 0x200,
+  EF_CUDA_TEXMODE_INDEPENDENT = 0x200,
   // The target is using 64-bit addressing.
   EF_CUDA_64BIT_ADDRESS = 0x400,
   // Set when using the sm_90a processor.
@@ -950,6 +959,11 @@ enum : unsigned {
   EF_CUDA_VIRTUAL_SM = 0xff0000,
 };
 
+// ELF Relocation types for CUDA
+enum {
+#include "ELFRelocs/CUDA.def"
+};
+
 // ELF Relocation types for BPF
 enum {
 #include "ELFRelocs/BPF.def"
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def
new file mode 100644
index 0000000000000..1cb562664b2f0
--- /dev/null
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def
@@ -0,0 +1,130 @@
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_CUDA_NONE,                   0)    // no relocation
+ELF_RELOC(R_CUDA_32,                     1)    // 32bit specific address
+ELF_RELOC(R_CUDA_64,                     2)    // 64bit specific address
+ELF_RELOC(R_CUDA_G32,                    3)    // 32bit generic address
+ELF_RELOC(R_CUDA_G64,                    4)    // 64bit generic address
+ELF_RELOC(R_CUDA_ABS32_26,               5)    // absolute_address(sym) -> bits 26-57
+ELF_RELOC(R_CUDA_TEX_HEADER_INDEX,       6)    // header_index(tex) -> bits 0-19
+ELF_RELOC(R_CUDA_SAMP_HEADER_INDEX,      7)    // header_index(samp) -> bits 20-31
+ELF_RELOC(R_CUDA_SURF_HW_DESC,           8)    // hw_descriptor(surf) -> 32 bytes
+ELF_RELOC(R_CUDA_SURF_HW_SW_DESC,        9)    // hw and sw descriptor(surf) -> 32 + size computed from EIATTR_QUERY_NUMATTRIB
+ELF_RELOC(R_CUDA_ABS32_LO_26,            10)   // lower 32bits of 64bit absolute_address(sym) -> bits 26-57
+ELF_RELOC(R_CUDA_ABS32_HI_26,            11)   // upper 32bits of 64bit absolute_address(sym) -> bits 26-57
+ELF_RELOC(R_CUDA_ABS32_23,               12)   // absolute_address(sym) -> bits 23-54
+ELF_RELOC(R_CUDA_ABS32_LO_23,            13)   // lower 32bits of 64bit absolute_address(sym) -> bits 23-54
+ELF_RELOC(R_CUDA_ABS32_HI_23,            14)   // upper 32bits of 64bit absolute_address(sym) -> bits 23-54
+ELF_RELOC(R_CUDA_ABS24_26,               15)   // 24bit absolute_address(sym) -> bits 26-49
+ELF_RELOC(R_CUDA_ABS24_23,               16)   // 24bit absolute_address(sym) -> bits 23-46
+ELF_RELOC(R_CUDA_ABS16_26,               17)   // 16bit absolute_address(sym) -> bits 26-41
+ELF_RELOC(R_CUDA_ABS16_23,               18)   // 16bit absolute_address(sym) -> bits 23-38
+ELF_RELOC(R_CUDA_TEX_SLOT,               19)   // 8bit slot of tex sym  -> bits 32-39
+ELF_RELOC(R_CUDA_SAMP_SLOT,              20)   // 5bit slot of samp sym -> bits 40-44
+ELF_RELOC(R_CUDA_SURF_SLOT,              21)   // 6bit slot of surf sym -> bits 26-31
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_32,   22)   // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 32-44
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_47,   23)   // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 47-59
+ELF_RELOC(R_CUDA_CONST_FIELD19_28,       24)   // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 28-41
+                                               // 4bit (constant bank for sym & 0xF) -> bits 42-45
+                                               // 1bit (constant bank for sym >> 4) & 0x1 -> bit 26-26
+ELF_RELOC(R_CUDA_CONST_FIELD19_23,       25)   // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 23-36
+                                               // 5 bit (constant bank for sym & 0x1F) -> bits 37-41
+ELF_RELOC(R_CUDA_TEX_SLOT9_49,           26)   // 9bit slot of tex sym -> bits 49-57
+ELF_RELOC(R_CUDA_6_31,                   27)   // 6bit sym value -> bits 31-36
+ELF_RELOC(R_CUDA_2_47,                   28)   // 2bit sym value -> bits 47-48
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_41,   29)   // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 41-53
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_45,   30)   // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 45-57
+ELF_RELOC(R_CUDA_FUNC_DESC32_23,         31)   // 32 bit function descriptor of entry function -> bits 23-54
+ELF_RELOC(R_CUDA_FUNC_DESC32_LO_23,      32)   // lower 32bits of 64bit function descriptor of entry function -> bits 23-54
+ELF_RELOC(R_CUDA_FUNC_DESC32_HI_23,      33)   // upper 32bits of 64bit function descriptor of entry function -> bits 23-54
+ELF_RELOC(R_CUDA_FUNC_DESC_32,           34)   // 32bit address of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_64,           35)   // 64bit address of function descriptor of entry function
+ELF_RELOC(R_CUDA_CONST_FIELD21_26,       36)   // 16bit ((offset of sym + addend) & 0xffffU) -> bits 26-41
+                                               // 5bit (constant bank for sym & 0x1F) -> bits 42-46
+ELF_RELOC(R_CUDA_QUERY_DESC21_37,        37)   // 16 bit offset of query descriptor -> bits 37-52
+                                               // 5 bit bank used for query descriptor -> bits 53-57
+ELF_RELOC(R_CUDA_CONST_FIELD19_26,       38)   // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 26-39
+                                               // 5 bit (constant bank for sym & 0x1F) -> bits 40-44
+ELF_RELOC(R_CUDA_CONST_FIELD21_23,       39)   // 16 bit ((offset of sym + addend) & 0xffffU) -> bits 23-38
+                                               // 5 bit (constant bank for sym & 0x1F) -> bits 39-43
+ELF_RELOC(R_CUDA_PCREL_IMM24_26,         40)   // 24 bit PC relative branch offset -> bits 26-49
+ELF_RELOC(R_CUDA_PCREL_IMM24_23,         41)   // 24 bit PC relative branch offset -> bits 23-46
+ELF_RELOC(R_CUDA_ABS32_20,               42)   // absolute_address(sym) -> bits 20-51
+ELF_RELOC(R_CUDA_ABS32_LO_20,            43)   // lower 32bits of 64bit absolute_address(sym) -> bits 20-51
+ELF_RELOC(R_CUDA_ABS32_HI_20,            44)   // upper 32bits of 64bit absolute_address(sym) -> bits 20-51
+ELF_RELOC(R_CUDA_ABS24_20,               45)   // 24bit absolute_address(sym) -> bits 20-43
+ELF_RELOC(R_CUDA_ABS16_20,               46)   // 16bit absolute_address(sym) -> bits 20-35
+ELF_RELOC(R_CUDA_FUNC_DESC32_20,         47)   // 32 bit function descriptor of entry function -> bits 20-51
+ELF_RELOC(R_CUDA_FUNC_DESC32_LO_20,      48)   // lower 32bits of 64bit function descriptor of entry function -> bits 20-51
+ELF_RELOC(R_CUDA_FUNC_DESC32_HI_20,      49)   // upper 32bits of 64bit function descriptor of entry function -> bits 20-51
+ELF_RELOC(R_CUDA_CONST_FIELD19_20,       50)   // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 20-33
+                                               // 5 bit (constant bank for sym & 0x1F) -> bits 34-38
+ELF_RELOC(R_CUDA_BINDLESSOFF13_36,       51)   // 13 bit (bindless offset allocated to tex/surf sym >> 2) -> bits 36-48
+ELF_RELOC(R_CUDA_SURF_HEADER_INDEX,      52)   // header_index(surf) -> bits 0-19
+ELF_RELOC(R_CUDA_INSTRUCTION64,          53)   // replace 64bits of instruction
+ELF_RELOC(R_CUDA_CONST_FIELD21_20,       54)   // 16 bit ((offset of sym + addend) & 0xffffU) -> bits 20-35
+                                               // 5 bit (constant bank for sym & 0x1F) -> bits 36-40
+ELF_RELOC(R_CUDA_ABS32_32,               55)   // 32bit absolute_address(sym) -> bits 32-63
+ELF_RELOC(R_CUDA_ABS32_LO_32,            56)   // lower 32bit of 64bit absolute_address(sym) -> bits 32-63
+ELF_RELOC(R_CUDA_ABS32_HI_32,            57)   // upper 32bit of 64bit absolute_address(sym) -> bits 32-63  
+ELF_RELOC(R_CUDA_ABS47_34,               58)   // 47bit (absolute_address(sym) >> 2) -> bits 34-80. Lower 2bits are assumed 0
+ELF_RELOC(R_CUDA_ABS16_32,               59)   // 16bit absolute_address(sym) -> bits 32-47 
+ELF_RELOC(R_CUDA_ABS24_32,               60)   // 24bit absolute_address(sym) -> bits 32-55 
+ELF_RELOC(R_CUDA_FUNC_DESC32_32,         61)   // 32 bit function descriptor of entry function -> bits 32-63
+ELF_RELOC(R_CUDA_FUNC_DESC32_LO_32,      62)   // lower 32bits of 64bit function descriptor of entry function -> bits 32-63
+ELF_RELOC(R_CUDA_FUNC_DESC32_HI_32,      63)   // upper 32bits of 64bit function descriptor of entry function -> bits 32-63
+ELF_RELOC(R_CUDA_CONST_FIELD19_40,       64)   // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 40-53
+                                               // 5 bit (constant bank for sym & 0x1F) -> bits 54-58
+ELF_RELOC(R_CUDA_BINDLESSOFF14_40,       65)   // 14 bit (bindless offset allocated to tex/surf sym >> 2) -> bits 40-53 
+ELF_RELOC(R_CUDA_CONST_FIELD21_38,       66)   // 16 bit ((offset of sym + addend) & 0xffffU) -> bits 38-53
+                                               // 5 bit (constant bank for sym & 0x1F) -> bits 54-58
+ELF_RELOC(R_CUDA_INSTRUCTION128,         67)   // replace 128bits of instruction
+ELF_RELOC(R_CUDA_YIELD_OPCODE9_0,        68)   // 9 bits addend -> bits 0-8. Sym is NULL and only addend will be used.
+ELF_RELOC(R_CUDA_YIELD_CLEAR_PRED4_87,   69)   // Clear 4 bits -> bits 87-90. Sym is NULL and addend is 0
+ELF_RELOC(R_CUDA_32_LO,                  70)   // lower 32bit of 64bit absolute_address(sym) -> bits 0-31
+ELF_RELOC(R_CUDA_32_HI,                  71)   // upper 32bit of 64bit absolute_address(sym) -> bits 0-31
+ELF_RELOC(R_CUDA_UNUSED_CLEAR32,         72)   // clear 32bits if Sym deleted
+ELF_RELOC(R_CUDA_UNUSED_CLEAR64,         73)   // clear 64bits if Sym deleted
+ELF_RELOC(R_CUDA_ABS24_40,               74)   // 24bit absolute_address(sym) -> bits 40-63
+ELF_RELOC(R_CUDA_ABS55_16_34,            75)   // 55bit (absolute_address(sym) >> 2) -> bits 16-23 and 34-80
+ELF_RELOC(R_CUDA_8_0,                    76)   // Bits 0-7 from specific address
+ELF_RELOC(R_CUDA_8_8,                    77)   // Bits 8-15 from specific address
+ELF_RELOC(R_CUDA_8_16,                   78)   // Bits 16-23 from specific address
+ELF_RELOC(R_CUDA_8_24,                   79)   // Bits 24-31 from specific address
+ELF_RELOC(R_CUDA_8_32,                   80)   // Bits 32-39 from specific address
+ELF_RELOC(R_CUDA_8_40,                   81)   // Bits 40-47 from specific address
+ELF_RELOC(R_CUDA_8_48,                   82)   // Bits 48-55 from specific address
+ELF_RELOC(R_CUDA_8_56,                   83)   // Bits 56-63 from specific address
+ELF_RELOC(R_CUDA_G8_0,                   84)   // Bits 0-7 from generic address
+ELF_RELOC(R_CUDA_G8_8,                   85)   // Bits 8-15 from generic address
+ELF_RELOC(R_CUDA_G8_16,                  86)   // Bits 16-23 from generic address
+ELF_RELOC(R_CUDA_G8_24,                  87)   // Bits 24-31 from generic address
+ELF_RELOC(R_CUDA_G8_32,                  88)   // Bits 32-39 from generic address
+ELF_RELOC(R_CUDA_G8_40,                  89)   // Bits 40-47 from generic address
+ELF_RELOC(R_CUDA_G8_48,                  90)   // Bits 48-55 from generic address
+ELF_RELOC(R_CUDA_G8_56,                  91)   // Bits 56-63 from generic address
+ELF_RELOC(R_CUDA_FUNC_DESC_8_0,          92)   // Bits 0-7 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_8,          93)   // Bits 8-15 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_16,         94)   // Bits 16-23 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_24,         95)   // Bits 24-31 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_32,         96)   // Bits 32-39 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_40,         97)   // Bits 40-47 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_48,         98)   // Bits 48-55 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_56,         99)   // Bits 56-63 of function descriptor of entry function
+ELF_RELOC(R_CUDA_ABS20_44,               100)  // 20bit absolute_address(sym) -> bits 44-63
+ELF_RELOC(R_CUDA_SAMP_HEADER_INDEX_0,    101)  // header_index(samp) -> bits 0-11
+ELF_RELOC(R_CUDA_UNIFIED,                102)  // 64bit unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_32,             103)  // 32bit unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_0,            104)  // Bits 0-7 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_8,            105)  // Bits 8-15 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_16,           106)  // Bits 16-23 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_24,           107)  // Bits 24-31 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_32,           108)  // Bits 32-39 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_40,           109)  // Bits 40-47 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_48,           110)  // Bits 48-55 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_56,           111)  // Bits 56-63 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED32_LO_32,        112)  // Lower 32bits of unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED32_HI_32,        113)  // Higher 32bits of unified address of symbol
+ELF_RELOC(R_CUDA_NONE_LAST,              114)
\ No newline at end of file
diff --git a/llvm/lib/MC/MCSymbolELF.cpp b/llvm/lib/MC/MCSymbolELF.cpp
index 5a3814867e3d8..903efd5153b01 100644
--- a/llvm/lib/MC/MCSymbolELF.cpp
+++ b/llvm/lib/MC/MCSymbolELF.cpp
@@ -22,21 +22,21 @@ enum {
   // Shift value for STV_* flags. 4 possible values, 2 bits.
   ELF_STV_Shift = 5,
 
-  // Shift value for STO_* flags. 3 bits. All the values are between 0x20 and
-  // 0xe0, so we shift right by 5 before storing.
+  // Shift value for STO_* flags. 4 bits. All the values are between 0x10 and
+  // 0xf0, so we shift right by 4 before storing.
   ELF_STO_Shift = 7,
 
   // One bit.
-  ELF_IsSignature_Shift = 10,
+  ELF_IsSignature_Shift = 11,
 
   // One bit.
-  ELF_WeakrefUsedInReloc_Shift = 11,
+  ELF_WeakrefUsedInReloc_Shift = 12,
 
   // One bit.
-  ELF_BindingSet_Shift = 12,
+  ELF_BindingSet_Shift = 13,
 
   // One bit.
-  ELF_IsMemoryTagged_Shift = 13,
+  ELF_IsMemoryTagged_Shift = 14,
 };
 }
 
@@ -158,16 +158,16 @@ unsigned MCSymbolELF::getVisibility() const {
 }
 
 void MCSymbolELF::setOther(unsigned Other) {
-  assert((Other & 0x1f) == 0);
-  Other >>= 5;
-  assert(Other <= 0x7);
-  uint32_t OtherFlags = getFlags() & ~(0x7 << ELF_STO_Shift);
+  assert((Other & 0xf) == 0);
+  Other >>= 4;
+  assert(Other <= 0xf);
+  uint32_t OtherFlags = getFlags() & ~(0xf << ELF_STO_Shift);
   setFlags(OtherFlags | (Other << ELF_STO_Shift));
 }
 
 unsigned MCSymbolELF::getOther() const {
-  unsigned Other = (Flags >> ELF_STO_Shift) & 7;
-  return Other << 5;
+  unsigned Other = (Flags >> ELF_STO_Shift) & 0xf;
+  return Other << 4;
 }
 
 void MCSymbolELF::setIsWeakrefUsedInReloc() const {
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index bf42c92a242a1..925edd495843f 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -182,6 +182,13 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
       break;
     }
     break;
+  case ELF::EM_CUDA:
+    switch (Type) {
+#include "llvm/BinaryFormat/ELFRelocs/CUDA.def"
+    default:
+      break;
+    }
+    break;
   default:
     break;
   }



More information about the llvm-commits mailing list