[llvm] [BinaryFormat][ELF][CUDA] Extended binary information for CUDA (PR #133076)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 26 06:17:31 PDT 2025
https://github.com/D-CUDA created https://github.com/llvm/llvm-project/pull/133076
Extended binary information for CUDA.
Provided information for CUDA-specific relocations.
Switched the EF_CUDA_SM* enum values to decimal to make them more intuitive.
>From 26c0c9ef8bc63ccf1ddacfdd59536e227dd979ff Mon Sep 17 00:00:00 2001
From: D-CUDA <dengers09 at mail.ru>
Date: Wed, 26 Mar 2025 16:01:38 +0300
Subject: [PATCH] [BinaryFormat][ELF][CUDA] Extended binary information for
CUDA, provided info for cuda specific relocations
---
llvm/include/llvm/BinaryFormat/ELF.h | 58 +++++---
.../llvm/BinaryFormat/ELFRelocs/CUDA.def | 130 ++++++++++++++++++
llvm/lib/MC/MCSymbolELF.cpp | 24 ++--
llvm/lib/Object/ELF.cpp | 7 +
4 files changed, 185 insertions(+), 34 deletions(-)
create mode 100644 llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 37eab89e706db..43acdbdb46424 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -907,38 +907,47 @@ enum {
#include "ELFRelocs/AMDGPU.def"
};
-// NVPTX specific e_flags.
+// CUDA specific st_other
+enum : unsigned {
+ STO_CUDA_ENTRY = 0x10, // entry function
+ STO_CUDA_GLOBAL = 0x20,
+ STO_CUDA_SHARED = 0x40,
+ STO_CUDA_LOCAL = 0x60,
+ STO_CUDA_CONSTANT = 0x80
+};
+
+// CUDA specific e_flags.
enum : unsigned {
// Processor selection mask for EF_CUDA_SM* values.
EF_CUDA_SM = 0xff,
// SM based processor values.
- EF_CUDA_SM20 = 0x14,
- EF_CUDA_SM21 = 0x15,
- EF_CUDA_SM30 = 0x1e,
- EF_CUDA_SM32 = 0x20,
- EF_CUDA_SM35 = 0x23,
- EF_CUDA_SM37 = 0x25,
- EF_CUDA_SM50 = 0x32,
- EF_CUDA_SM52 = 0x34,
- EF_CUDA_SM53 = 0x35,
- EF_CUDA_SM60 = 0x3c,
- EF_CUDA_SM61 = 0x3d,
- EF_CUDA_SM62 = 0x3e,
- EF_CUDA_SM70 = 0x46,
- EF_CUDA_SM72 = 0x48,
- EF_CUDA_SM75 = 0x4b,
- EF_CUDA_SM80 = 0x50,
- EF_CUDA_SM86 = 0x56,
- EF_CUDA_SM87 = 0x57,
- EF_CUDA_SM89 = 0x59,
+ EF_CUDA_SM20 = 20,
+ EF_CUDA_SM21 = 21,
+ EF_CUDA_SM30 = 30,
+ EF_CUDA_SM32 = 32,
+ EF_CUDA_SM35 = 35,
+ EF_CUDA_SM37 = 37,
+ EF_CUDA_SM50 = 50,
+ EF_CUDA_SM52 = 52,
+ EF_CUDA_SM53 = 53,
+ EF_CUDA_SM60 = 60,
+ EF_CUDA_SM61 = 61,
+ EF_CUDA_SM62 = 62,
+ EF_CUDA_SM70 = 70,
+ EF_CUDA_SM72 = 72,
+ EF_CUDA_SM75 = 75,
+ EF_CUDA_SM80 = 80,
+ EF_CUDA_SM86 = 86,
+ EF_CUDA_SM87 = 87,
+ EF_CUDA_SM89 = 89,
// The sm_90a variant uses the same machine flag.
- EF_CUDA_SM90 = 0x5a,
+ EF_CUDA_SM90 = 90,
// Unified texture binding is enabled.
EF_CUDA_TEXMODE_UNIFIED = 0x100,
// Independent texture binding is enabled.
- EF_CUDA_TEXMODE_INDEPENDANT = 0x200,
+ EF_CUDA_TEXMODE_INDEPENDENT = 0x200,
// The target is using 64-bit addressing.
EF_CUDA_64BIT_ADDRESS = 0x400,
// Set when using the sm_90a processor.
@@ -950,6 +959,11 @@ enum : unsigned {
EF_CUDA_VIRTUAL_SM = 0xff0000,
};
+// ELF Relocation types for CUDA
+enum {
+#include "ELFRelocs/CUDA.def"
+};
+
// ELF Relocation types for BPF
enum {
#include "ELFRelocs/BPF.def"
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def
new file mode 100644
index 0000000000000..1cb562664b2f0
--- /dev/null
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/CUDA.def
@@ -0,0 +1,130 @@
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_CUDA_NONE, 0) // no relocation
+ELF_RELOC(R_CUDA_32, 1) // 32bit specific address
+ELF_RELOC(R_CUDA_64, 2) // 64bit specific address
+ELF_RELOC(R_CUDA_G32, 3) // 32bit generic address
+ELF_RELOC(R_CUDA_G64, 4) // 64bit generic address
+ELF_RELOC(R_CUDA_ABS32_26, 5) // absolute_address(sym) -> bits 26-57
+ELF_RELOC(R_CUDA_TEX_HEADER_INDEX, 6) // header_index(tex) -> bits 0-19
+ELF_RELOC(R_CUDA_SAMP_HEADER_INDEX, 7) // header_index(samp) -> bits 20-31
+ELF_RELOC(R_CUDA_SURF_HW_DESC, 8) // hw_descriptor(surf) -> 32 bytes
+ELF_RELOC(R_CUDA_SURF_HW_SW_DESC, 9) // hw and sw descriptor(surf) -> 32 + size computed from EIATTR_QUERY_NUMATTRIB
+ELF_RELOC(R_CUDA_ABS32_LO_26, 10) // lower 32bits of 64bit absolute_address(sym) -> bits 26-57
+ELF_RELOC(R_CUDA_ABS32_HI_26, 11) // upper 32bits of 64bit absolute_address(sym) -> bits 26-57
+ELF_RELOC(R_CUDA_ABS32_23, 12) // absolute_address(sym) -> bits 23-54
+ELF_RELOC(R_CUDA_ABS32_LO_23, 13) // lower 32bits of 64bit absolute_address(sym) -> bits 23-54
+ELF_RELOC(R_CUDA_ABS32_HI_23, 14) // upper 32bits of 64bit absolute_address(sym) -> bits 23-54
+ELF_RELOC(R_CUDA_ABS24_26, 15) // 24bit absolute_address(sym) -> bits 26-49
+ELF_RELOC(R_CUDA_ABS24_23, 16) // 24bit absolute_address(sym) -> bits 23-46
+ELF_RELOC(R_CUDA_ABS16_26, 17) // 16bit absolute_address(sym) -> bits 26-41
+ELF_RELOC(R_CUDA_ABS16_23, 18) // 16bit absolute_address(sym) -> bits 23-38
+ELF_RELOC(R_CUDA_TEX_SLOT, 19) // 8bit slot of tex sym -> bits 32-39
+ELF_RELOC(R_CUDA_SAMP_SLOT, 20) // 5bit slot of samp sym -> bits 40-44
+ELF_RELOC(R_CUDA_SURF_SLOT, 21) // 6bit slot of surf sym -> bits 26-31
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_32, 22) // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 32-44
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_47, 23) // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 47-59
+ELF_RELOC(R_CUDA_CONST_FIELD19_28, 24) // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 28-41
+ // 4bit (constant bank for sym & 0xF) -> bits 42-45
+ // 1bit (constant bank for sym >> 4) & 0x1 -> bit 26-26
+ELF_RELOC(R_CUDA_CONST_FIELD19_23, 25) // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 23-36
+ // 5 bit (constant bank for sym & 0x1F) -> bits 37-41
+ELF_RELOC(R_CUDA_TEX_SLOT9_49, 26) // 9bit slot of tex sym -> bits 49-57
+ELF_RELOC(R_CUDA_6_31, 27) // 6bit sym value -> bits 31-36
+ELF_RELOC(R_CUDA_2_47, 28) // 2bit sym value -> bits 47-48
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_41, 29) // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 41-53
+ELF_RELOC(R_CUDA_TEX_BINDLESSOFF13_45, 30) // 13 bit (bindless offset allocated to tex sym >> 2) -> bits 45-57
+ELF_RELOC(R_CUDA_FUNC_DESC32_23, 31) // 32 bit function descriptor of entry function -> bits 23-54
+ELF_RELOC(R_CUDA_FUNC_DESC32_LO_23, 32) // lower 32bits of 64bit function descriptor of entry function -> bits 23-54
+ELF_RELOC(R_CUDA_FUNC_DESC32_HI_23, 33) // upper 32bits of 64bit function descriptor of entry function -> bits 23-54
+ELF_RELOC(R_CUDA_FUNC_DESC_32, 34) // 32bit address of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_64, 35) // 64bit address of function descriptor of entry function
+ELF_RELOC(R_CUDA_CONST_FIELD21_26, 36) // 16bit ((offset of sym + addend) & 0xffffU) -> bits 26-41
+ // 5bit (constant bank for sym & 0x1F) -> bits 42-46
+ELF_RELOC(R_CUDA_QUERY_DESC21_37, 37) // 16 bit offset of query descriptor -> bits 37-52
+ // 5 bit bank used for query descriptor -> bits 53-57
+ELF_RELOC(R_CUDA_CONST_FIELD19_26, 38) // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 26-39
+ // 5 bit (constant bank for sym & 0x1F) -> bits 40-44
+ELF_RELOC(R_CUDA_CONST_FIELD21_23, 39) // 16 bit ((offset of sym + addend) & 0xffffU) -> bits 23-38
+ // 5 bit (constant bank for sym & 0x1F) -> bits 39-43
+ELF_RELOC(R_CUDA_PCREL_IMM24_26, 40) // 24 bit PC relative branch offset -> bits 26-49
+ELF_RELOC(R_CUDA_PCREL_IMM24_23, 41) // 24 bit PC relative branch offset -> bits 23-46
+ELF_RELOC(R_CUDA_ABS32_20, 42) // absolute_address(sym) -> bits 20-51
+ELF_RELOC(R_CUDA_ABS32_LO_20, 43) // lower 32bits of 64bit absolute_address(sym) -> bits 20-51
+ELF_RELOC(R_CUDA_ABS32_HI_20, 44) // upper 32bits of 64bit absolute_address(sym) -> bits 20-51
+ELF_RELOC(R_CUDA_ABS24_20, 45) // 24bit absolute_address(sym) -> bits 20-43
+ELF_RELOC(R_CUDA_ABS16_20, 46) // 16bit absolute_address(sym) -> bits 20-35
+ELF_RELOC(R_CUDA_FUNC_DESC32_20, 47) // 32 bit function descriptor of entry function -> bits 20-51
+ELF_RELOC(R_CUDA_FUNC_DESC32_LO_20, 48) // lower 32bits of 64bit function descriptor of entry function -> bits 20-51
+ELF_RELOC(R_CUDA_FUNC_DESC32_HI_20, 49) // upper 32bits of 64bit function descriptor of entry function -> bits 20-51
+ELF_RELOC(R_CUDA_CONST_FIELD19_20, 50) // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 20-33
+ // 5 bit (constant bank for sym & 0x1F) -> bits 34-38
+ELF_RELOC(R_CUDA_BINDLESSOFF13_36, 51) // 13 bit (bindless offset allocated to tex/surf sym >> 2) -> bits 36-48
+ELF_RELOC(R_CUDA_SURF_HEADER_INDEX, 52) // header_index(surf) -> bits 0-19
+ELF_RELOC(R_CUDA_INSTRUCTION64, 53) // replace 64bits of instruction
+ELF_RELOC(R_CUDA_CONST_FIELD21_20, 54) // 16 bit ((offset of sym + addend) & 0xffffU) -> bits 20-35
+ // 5 bit (constant bank for sym & 0x1F) -> bits 36-40
+ELF_RELOC(R_CUDA_ABS32_32, 55) // 32bit absolute_address(sym) -> bits 32-63
+ELF_RELOC(R_CUDA_ABS32_LO_32, 56) // lower 32bit of 64bit absolute_address(sym) -> bits 32-63
+ELF_RELOC(R_CUDA_ABS32_HI_32, 57) // upper 32bit of 64bit absolute_address(sym) -> bits 32-63
+ELF_RELOC(R_CUDA_ABS47_34, 58) // 47bit (absolute_address(sym) >> 2) -> bits 34-80. Lower 2bits are assumed 0
+ELF_RELOC(R_CUDA_ABS16_32, 59) // 16bit absolute_address(sym) -> bits 32-47
+ELF_RELOC(R_CUDA_ABS24_32, 60) // 24bit absolute_address(sym) -> bits 32-55
+ELF_RELOC(R_CUDA_FUNC_DESC32_32, 61) // 32 bit function descriptor of entry function -> bits 32-63
+ELF_RELOC(R_CUDA_FUNC_DESC32_LO_32, 62) // lower 32bits of 64bit function descriptor of entry function -> bits 32-63
+ELF_RELOC(R_CUDA_FUNC_DESC32_HI_32, 63) // upper 32bits of 64bit function descriptor of entry function -> bits 32-63
+ELF_RELOC(R_CUDA_CONST_FIELD19_40, 64) // 14 bit (((offset of sym + addend) & 0xffffU) >> 2) -> bits 40-53
+ // 5 bit (constant bank for sym & 0x1F) -> bits 54-58
+ELF_RELOC(R_CUDA_BINDLESSOFF14_40, 65) // 14 bit (bindless offset allocated to tex/surf sym >> 2) -> bits 40-53
+ELF_RELOC(R_CUDA_CONST_FIELD21_38, 66) // 16 bit ((offset of sym + addend) & 0xffffU) -> bits 38-53
+ // 5 bit (constant bank for sym & 0x1F) -> bits 54-58
+ELF_RELOC(R_CUDA_INSTRUCTION128, 67) // replace 128bits of instruction
+ELF_RELOC(R_CUDA_YIELD_OPCODE9_0, 68) // 9 bits addend -> bits 0-8. Sym is NULL and only addend will be used.
+ELF_RELOC(R_CUDA_YIELD_CLEAR_PRED4_87, 69) // Clear 4 bits -> bits 87-90. Sym is NULL and addend is 0
+ELF_RELOC(R_CUDA_32_LO, 70) // lower 32bit of 64bit absolute_address(sym) -> bits 0-31
+ELF_RELOC(R_CUDA_32_HI, 71) // upper 32bit of 64bit absolute_address(sym) -> bits 0-31
+ELF_RELOC(R_CUDA_UNUSED_CLEAR32, 72) // clear 32bits if Sym deleted
+ELF_RELOC(R_CUDA_UNUSED_CLEAR64, 73) // clear 64bits if Sym deleted
+ELF_RELOC(R_CUDA_ABS24_40, 74) // 24bit absolute_address(sym) -> bits 40-63
+ELF_RELOC(R_CUDA_ABS55_16_34, 75) // 55bit (absolute_address(sym) >> 2) -> bits 16-23 and 34-80
+ELF_RELOC(R_CUDA_8_0, 76) // Bits 0-7 from specific address
+ELF_RELOC(R_CUDA_8_8, 77) // Bits 8-15 from specific address
+ELF_RELOC(R_CUDA_8_16, 78) // Bits 16-23 from specific address
+ELF_RELOC(R_CUDA_8_24, 79) // Bits 24-31 from specific address
+ELF_RELOC(R_CUDA_8_32, 80) // Bits 32-39 from specific address
+ELF_RELOC(R_CUDA_8_40, 81) // Bits 40-47 from specific address
+ELF_RELOC(R_CUDA_8_48, 82) // Bits 48-55 from specific address
+ELF_RELOC(R_CUDA_8_56, 83) // Bits 56-63 from specific address
+ELF_RELOC(R_CUDA_G8_0, 84) // Bits 0-7 from generic address
+ELF_RELOC(R_CUDA_G8_8, 85) // Bits 8-15 from generic address
+ELF_RELOC(R_CUDA_G8_16, 86) // Bits 16-23 from generic address
+ELF_RELOC(R_CUDA_G8_24, 87) // Bits 24-31 from generic address
+ELF_RELOC(R_CUDA_G8_32, 88) // Bits 32-39 from generic address
+ELF_RELOC(R_CUDA_G8_40, 89) // Bits 40-47 from generic address
+ELF_RELOC(R_CUDA_G8_48, 90) // Bits 48-55 from generic address
+ELF_RELOC(R_CUDA_G8_56, 91) // Bits 56-63 from generic address
+ELF_RELOC(R_CUDA_FUNC_DESC_8_0, 92) // Bits 0-7 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_8, 93) // Bits 8-15 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_16, 94) // Bits 16-23 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_24, 95) // Bits 24-31 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_32, 96) // Bits 32-39 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_40, 97) // Bits 40-47 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_48, 98) // Bits 48-55 of function descriptor of entry function
+ELF_RELOC(R_CUDA_FUNC_DESC_8_56, 99) // Bits 56-63 of function descriptor of entry function
+ELF_RELOC(R_CUDA_ABS20_44, 100) // 20bit absolute_address(sym) -> bits 44-63
+ELF_RELOC(R_CUDA_SAMP_HEADER_INDEX_0, 101) // header_index(samp) -> bits 0-11
+ELF_RELOC(R_CUDA_UNIFIED, 102) // 64bit unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_32, 103) // 32bit unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_0, 104) // Bits 0-7 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_8, 105) // Bits 8-15 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_16, 106) // Bits 16-23 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_24, 107) // Bits 24-31 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_32, 108) // Bits 32-39 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_40, 109) // Bits 40-47 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_48, 110) // Bits 48-55 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED_8_56, 111) // Bits 56-63 from unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED32_LO_32, 112) // Lower 32bits of unified address of symbol
+ELF_RELOC(R_CUDA_UNIFIED32_HI_32, 113) // Higher 32bits of unified address of symbol
+ELF_RELOC(R_CUDA_NONE_LAST, 114)
\ No newline at end of file
diff --git a/llvm/lib/MC/MCSymbolELF.cpp b/llvm/lib/MC/MCSymbolELF.cpp
index 5a3814867e3d8..903efd5153b01 100644
--- a/llvm/lib/MC/MCSymbolELF.cpp
+++ b/llvm/lib/MC/MCSymbolELF.cpp
@@ -22,21 +22,21 @@ enum {
// Shift value for STV_* flags. 4 possible values, 2 bits.
ELF_STV_Shift = 5,
- // Shift value for STO_* flags. 3 bits. All the values are between 0x20 and
- // 0xe0, so we shift right by 5 before storing.
+ // Shift value for STO_* flags. 4 bits. All the values are between 0x10 and
+ // 0xf0, so we shift right by 4 before storing.
ELF_STO_Shift = 7,
// One bit.
- ELF_IsSignature_Shift = 10,
+ ELF_IsSignature_Shift = 11,
// One bit.
- ELF_WeakrefUsedInReloc_Shift = 11,
+ ELF_WeakrefUsedInReloc_Shift = 12,
// One bit.
- ELF_BindingSet_Shift = 12,
+ ELF_BindingSet_Shift = 13,
// One bit.
- ELF_IsMemoryTagged_Shift = 13,
+ ELF_IsMemoryTagged_Shift = 14,
};
}
@@ -158,16 +158,16 @@ unsigned MCSymbolELF::getVisibility() const {
}
void MCSymbolELF::setOther(unsigned Other) {
- assert((Other & 0x1f) == 0);
- Other >>= 5;
- assert(Other <= 0x7);
- uint32_t OtherFlags = getFlags() & ~(0x7 << ELF_STO_Shift);
+ assert((Other & 0xf) == 0);
+ Other >>= 4;
+ assert(Other <= 0xf);
+ uint32_t OtherFlags = getFlags() & ~(0xf << ELF_STO_Shift);
setFlags(OtherFlags | (Other << ELF_STO_Shift));
}
unsigned MCSymbolELF::getOther() const {
- unsigned Other = (Flags >> ELF_STO_Shift) & 7;
- return Other << 5;
+ unsigned Other = (Flags >> ELF_STO_Shift) & 0xf;
+ return Other << 4;
}
void MCSymbolELF::setIsWeakrefUsedInReloc() const {
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index bf42c92a242a1..925edd495843f 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -182,6 +182,13 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
break;
}
break;
+ case ELF::EM_CUDA:
+ switch (Type) {
+#include "llvm/BinaryFormat/ELFRelocs/CUDA.def"
+ default:
+ break;
+ }
+ break;
default:
break;
}
More information about the llvm-commits
mailing list