[llvm] [RFC][BPF] Support Jump Table (PR #133856)
via llvm-commits
llvm-commits at lists.llvm.org
Sun May 11 11:39:44 PDT 2025
https://github.com/yonghong-song updated https://github.com/llvm/llvm-project/pull/133856
>From 7699a538e3455707330cd993cfa98ba897c296d1 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Mon, 31 Mar 2025 21:25:26 -0700
Subject: [PATCH 1/5] [RFC][BPF] Support Jump Table
NOTE: We probably need cpu v5 or other flags to enable this feature.
We can add it later when necessary.
This patch adds jump table support. A new insn 'gotox <reg>' is
added to allow an indirect goto through a register. The register holds
the target address within the current section. The following is a concrete
example based on the bpf selftest progs/user_ringbuf_success.c.
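For reference, the switch in handle_sample_msg that is lowered into a jump
table looks roughly like the sketch below. The names (struct test_msg,
TEST_MSG_OP_*, kern_mutated) follow the selftest but are reproduced from
memory here, so details may differ.

struct test_msg {
        unsigned int msg_op;
        union {
                long long operand_64;
                int operand_32;
        };
};

long long kern_mutated;

static int handle_sample_msg(const struct test_msg *msg)
{
        switch (msg->msg_op) {
        case 0: /* TEST_MSG_OP_INC64 */
                kern_mutated += msg->operand_64;
                break;
        case 1: /* TEST_MSG_OP_INC32 */
                kern_mutated += msg->operand_32;
                break;
        case 2: /* TEST_MSG_OP_MUL64 */
                kern_mutated *= msg->operand_64;
                break;
        case 3: /* TEST_MSG_OP_MUL32 */
                kern_mutated *= msg->operand_32;
                break;
        }
        return 0;
}

With four reachable cases (matching the default minimum of 4 jump table
entries, see below), the switch is lowered into the gotox plus .rodata
jump table sequence shown in the following assembly.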
Compilation command line to generate .s file:
=============================================
clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian \
-I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include \
-I/home/yhs/work/bpf-next/tools/testing/selftests/bpf \
-I/home/yhs/work/bpf-next/tools/include/uapi \
-I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -std=gnu11 \
-fno-strict-aliasing -Wno-compare-distinct-pointer-types \
-idirafter /home/yhs/work/llvm-project/llvm/build.21/Release/lib/clang/21/include \
-idirafter /usr/local/include -idirafter /usr/include \
-DENABLE_ATOMICS_TESTS -O2 -S progs/user_ringbuf_success.c \
-o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/user_ringbuf_success.bpf.o.s \
--target=bpf -mcpu=v3
The related assembly:
read_protocol_msg:
...
r3 <<= 3
r1 = .LJTI1_0 ll
r1 += r3
r1 = *(u64 *)(r1 + 0)
gotox r1
LBB1_4:
r1 = *(u64 *)(r0 + 8)
goto LBB1_5
LBB1_7:
r1 = *(u64 *)(r0 + 8)
goto LBB1_8
LBB1_9:
w1 = *(u32 *)(r0 + 8)
r1 <<= 32
r1 s>>= 32
r2 = kern_mutated ll
r3 = *(u64 *)(r2 + 0)
r3 *= r1
*(u64 *)(r2 + 0) = r3
goto LBB1_11
LBB1_6:
w1 = *(u32 *)(r0 + 8)
r1 <<= 32
r1 s>>= 32
LBB1_5:
...
.section .rodata,"a", at progbits
.p2align 3, 0x0
.LJTI1_0:
.quad LBB1_4
.quad LBB1_6
.quad LBB1_7
.quad LBB1_9
...
publish_next_kern_msg:
...
r6 <<= 3
r1 = .LJTI6_0 ll
r1 += r6
r1 = *(u64 *)(r1 + 0)
gotox r1
LBB6_3:
...
LBB6_5:
...
LBB6_6:
...
LBB6_4:
...
.section .rodata,"a", at progbits
.p2align 3, 0x0
.LJTI6_0:
.quad LBB6_3
.quad LBB6_4
.quad LBB6_5
.quad LBB6_6
Now let us look at the .o file
==============================
clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian \
-I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include \
-I/home/yhs/work/bpf-next/tools/testing/selftests/bpf \
-I/home/yhs/work/bpf-next/tools/include/uapi \
-I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include \
-std=gnu11 -fno-strict-aliasing -Wno-compare-distinct-pointer-types \
-idirafter /home/yhs/work/llvm-project/llvm/build.21/Release/lib/clang/21/include \
-idirafter /usr/local/include -idirafter /usr/include -DENABLE_ATOMICS_TESTS \
-O2 -c progs/user_ringbuf_success.c \
-o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/user_ringbuf_success.bpf.o \
--target=bpf -mcpu=v3
In the object file, all .rodata sections are merged together, so we have:
$ llvm-readelf -x '.rodata' user_ringbuf_success.bpf.o
Hex dump of section '.rodata':
0x00000000 a8020000 00000000 10030000 00000000 ................
0x00000010 b8020000 00000000 c8020000 00000000 ................
0x00000020 40040000 00000000 18050000 00000000 @...............
0x00000030 88040000 00000000 d0040000 00000000 ................
0x00000040 44726169 6e207265 7475726e 65643a20 Drain returned:
0x00000050 256c640a 00556e65 78706563 7465646c %ld..Unexpectedl
0x00000060 79206661 696c6564 20746f20 67657420 y failed to get
0x00000070 6d73670a 00556e72 65636f67 6e697a65 msg..Unrecognize
0x00000080 64206f70 2025640a 00256c75 20213d20 d op %d..%lu !=
0x00000090 256c750a 00627066 5f64796e 7074725f %lu..bpf_dynptr_
0x000000a0 72656164 28292066 61696c65 643a2025 read() failed: %
0x000000b0 640a0055 6e657870 65637465 646c7920 d..Unexpectedly
0x000000c0 6661696c 65642074 6f206765 74207361 failed to get sa
0x000000d0 6d706c65 0a00 mple..
Let us look at the insns; the annotations below explain the details.
$ llvm-objdump -Sr user_ringbuf_success.bpf.o
....
Disassembly of section .text:
0000000000000000 <read_protocol_msg>:
; msg = bpf_dynptr_data(dynptr, 0, sizeof(*msg));
0: b4 02 00 00 00 00 00 00 w2 = 0x0
1: b4 03 00 00 10 00 00 00 w3 = 0x10
2: 85 00 00 00 cb 00 00 00 call 0xcb
...
0000000000000268 <handle_sample_msg>:
; switch (msg->msg_op) {
77: 61 13 00 00 00 00 00 00 w3 = *(u32 *)(r1 + 0x0)
78: 26 03 1c 00 03 00 00 00 if w3 > 0x3 goto +0x1c <handle_sample_msg+0xf0>
79: 67 03 00 00 03 00 00 00 r3 <<= 0x3
80: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
0000000000000280: R_BPF_64_64 .rodata
<=== r2 will be the address of .rodata with offset 0.
<=== look at the first 32 bytes of .rodata:
0x00000000 a8020000 00000000 10030000 00000000 ................
0x00000010 b8020000 00000000 c8020000 00000000 ................
The four actual addresses are
0x2a8: insn idx 0x2a8/8 = 85
0x310: insn idx 0x310/8 = 98
0x2b8: insn idx 0x2b8/8 = 87
0x2c8: insn idx 0x2c8/8 = 89
82: 0f 32 00 00 00 00 00 00 r2 += r3
83: 79 22 00 00 00 00 00 00 r2 = *(u64 *)(r2 + 0x0)
84: 0d 02 00 00 00 00 00 00 gotox r2
<=== So eventually gotox will go to the insn idx in this section.
; kern_mutated += msg->operand_64;
85: 79 11 08 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x8)
86: 05 00 0e 00 00 00 00 00 goto +0xe <handle_sample_msg+0xc0>
; kern_mutated *= msg->operand_64;
87: 79 11 08 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x8)
88: 05 00 03 00 00 00 00 00 goto +0x3 <handle_sample_msg+0x78>
; kern_mutated *= msg->operand_32;
89: 61 11 08 00 00 00 00 00 w1 = *(u32 *)(r1 + 0x8)
90: 67 01 00 00 20 00 00 00 r1 <<= 0x20
91: c7 01 00 00 20 00 00 00 r1 s>>= 0x20
92: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
...
00000000000003a0 <publish_next_kern_msg>:
; {
116: bc 16 00 00 00 00 00 00 w6 = w1
; msg = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*msg), 0);
117: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll
00000000000003a8: R_BPF_64_64 kernel_ringbuf
119: b7 02 00 00 10 00 00 00 r2 = 0x10
120: b7 03 00 00 00 00 00 00 r3 = 0x0
121: 85 00 00 00 83 00 00 00 call 0x83
; if (!msg) {
122: 55 00 06 00 00 00 00 00 if r0 != 0x0 goto +0x6 <publish_next_kern_msg+0x68>
; err = 4;
123: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll
00000000000003d8: R_BPF_64_64 err
125: b4 02 00 00 04 00 00 00 w2 = 0x4
126: 63 21 00 00 00 00 00 00 *(u32 *)(r1 + 0x0) = w2
127: b4 00 00 00 01 00 00 00 w0 = 0x1
; return 1;
128: 05 00 31 00 00 00 00 00 goto +0x31 <publish_next_kern_msg+0x1f0>
; switch (index % TEST_MSG_OP_NUM_OPS) {
129: 54 06 00 00 03 00 00 00 w6 &= 0x3
130: 67 06 00 00 03 00 00 00 r6 <<= 0x3
131: 18 01 00 00 20 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x20 ll
0000000000000418: R_BPF_64_64 .rodata
<=== r1 will be the address of .rodata with offset 0x20.
<=== look at the 32 bytes of .rodata at offset 0x20:
0x00000020 40040000 00000000 18050000 00000000 @...............
0x00000030 88040000 00000000 d0040000 00000000 ................
The four actual addresses are
0x440: insn idx 0x440/8 = 136
0x518: insn idx 0x518/8 = 163
0x488: insn idx 0x488/8 = 145
0x4d0: insn idx 0x4d0/8 = 154
133: 0f 61 00 00 00 00 00 00 r1 += r6
134: 79 11 00 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x0)
135: 0d 01 00 00 00 00 00 00 gotox r1
<=== So eventually gotox will go to the insn idx in this section.
136: b4 01 00 00 00 00 00 00 w1 = 0x0
; msg->msg_op = TEST_MSG_OP_INC64;
137: 63 10 00 00 00 00 00 00 *(u32 *)(r0 + 0x0) = w1
138: b7 01 00 00 04 00 00 00 r1 = 0x4
; msg->operand_64 = operand_64;
139: 7b 10 08 00 00 00 00 00 *(u64 *)(r0 + 0x8) = r1
; expected_user_mutated += operand_64;
140: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll
0000000000000460: R_BPF_64_64 expected_user_mutated
142: 79 11 00 00 00 00 00 00 r1 = *(u64 *)(r1 + 0x0)
143: 07 01 00 00 04 00 00 00 r1 += 0x4
; break;
144: 05 00 1a 00 00 00 00 00 goto +0x1a <publish_next_kern_msg+0x1b8>
145: b4 01 00 00 02 00 00 00 w1 = 0x2
; msg->msg_op = TEST_MSG_OP_MUL64;
...
There are a few things worth discussing.
First, in the above, it is hard to find the jump table size for a particular
relocation ('R_BPF_64_64 .rodata + <offset>'). One approach is to scan through
the whole ELF file and find all '.rodata + <offset>' relocations.
For example, here we have
.rodata + 0
.rodata + 0x20
.rodata + 0x40
.rodata + 0x55
.rodata + 0x75
.rodata + 0x89
.rodata + 0x95
.rodata + 0xb3
With the above information, the size of each sub-rodata can be found easily.
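A minimal sketch of that size computation, assuming the '.rodata + <offset>'
relocation offsets have already been collected and sorted (the ELF-walking
part is omitted and all names here are made up for illustration):

#include <stdio.h>

/* Given sorted sub-rodata start offsets and the total .rodata size,
 * the size of each sub-rodata is the distance to the next offset
 * (or to the end of the section for the last one).
 */
static void print_subrodata_sizes(const unsigned long *offs, int n,
                                  unsigned long rodata_size)
{
        for (int i = 0; i < n; i++) {
                unsigned long end = (i + 1 < n) ? offs[i + 1] : rodata_size;

                printf("sub-rodata at 0x%lx, size 0x%lx\n",
                       offs[i], end - offs[i]);
        }
}

int main(void)
{
        /* the offsets listed above; 0xd6 is the .rodata section size */
        unsigned long offs[] = { 0x0, 0x20, 0x40, 0x55, 0x75, 0x89, 0x95, 0xb3 };

        print_subrodata_sizes(offs, 8, 0xd6);
        return 0;
}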
An option -bpf-min-jump-table-entries is implemented to control the minimum
number of entries needed to use a jump table on BPF. The default value is 4,
but it can be changed with the following clang option
clang ... -mllvm -bpf-min-jump-table-entries=6
where the number of jump table cases needs to be >= 6 in order to
use a jump table.
---
llvm/lib/Target/BPF/BPFISelLowering.cpp | 36 +++++++++++++++++++++++--
llvm/lib/Target/BPF/BPFISelLowering.h | 2 ++
llvm/lib/Target/BPF/BPFInstrInfo.td | 27 +++++++++++++++++++
llvm/lib/Target/BPF/BPFMCInstLower.cpp | 3 +++
4 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 6c196309d2d1a..cff66ed628140 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -36,6 +36,10 @@ static cl::opt<bool> BPFExpandMemcpyInOrder("bpf-expand-memcpy-in-order",
cl::Hidden, cl::init(false),
cl::desc("Expand memcpy into load/store pairs in order"));
+static cl::opt<unsigned> BPFMinimumJumpTableEntries(
+ "bpf-min-jump-table-entries", cl::init(4), cl::Hidden,
+ cl::desc("Set minimum number of entries to use a jump table on BPF"));
+
static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg,
SDValue Val = {}) {
std::string Str;
@@ -65,10 +69,11 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
- setOperationAction({ISD::GlobalAddress, ISD::ConstantPool}, MVT::i64, Custom);
+ setOperationAction({ISD::GlobalAddress, ISD::ConstantPool, ISD::JumpTable,
+ ISD::BlockAddress},
+ MVT::i64, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
@@ -155,6 +160,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
setBooleanContents(ZeroOrOneBooleanContent);
setMaxAtomicSizeInBitsSupported(64);
+ setMinimumJumpTableEntries(BPFMinimumJumpTableEntries);
// Function alignments
setMinFunctionAlignment(Align(8));
@@ -312,10 +318,14 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
report_fatal_error("unimplemented opcode: " + Twine(Op.getOpcode()));
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
case ISD::ConstantPool:
return LowerConstantPool(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
case ISD::SELECT_CC:
return LowerSELECT_CC(Op, DAG);
case ISD::SDIV:
@@ -726,6 +736,11 @@ SDValue BPFTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
return Op;
}
+SDValue BPFTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+ return getAddr(N, DAG);
+}
+
const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((BPFISD::NodeType)Opcode) {
case BPFISD::FIRST_NUMBER:
@@ -757,6 +772,17 @@ static SDValue getTargetNode(ConstantPoolSDNode *N, const SDLoc &DL, EVT Ty,
N->getOffset(), Flags);
}
+static SDValue getTargetNode(BlockAddressSDNode *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) {
+ return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
+ Flags);
+}
+
+static SDValue getTargetNode(JumpTableSDNode *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) {
+ return DAG.getTargetJumpTable(N->getIndex(), Ty, Flags);
+}
+
template <class NodeTy>
SDValue BPFTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
unsigned Flags) const {
@@ -783,6 +809,12 @@ SDValue BPFTargetLowering::LowerConstantPool(SDValue Op,
return getAddr(N, DAG);
}
+SDValue BPFTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+ return getAddr(N, DAG);
+}
+
unsigned
BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Reg, bool isSigned) const {
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index 8104895cb7f14..8d5056cfe0ed4 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -80,6 +80,8 @@ class BPFTargetLowering : public TargetLowering {
SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index b21f1a0eee3b0..c715bdb01866a 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -183,6 +183,15 @@ class TYPE_LD_ST<bits<3> mode, bits<2> size,
let Inst{60-59} = size;
}
+// For indirect jump
+class TYPE_IND_JMP<bits<4> op, bits<1> srctype,
+ dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstBPF<outs, ins, asmstr, pattern> {
+
+ let Inst{63-60} = op;
+ let Inst{59} = srctype;
+}
+
// jump instructions
class JMP_RR<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
: TYPE_ALU_JMP<Opc.Value, BPF_X.Value,
@@ -216,6 +225,18 @@ class JMP_RI<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
let BPFClass = BPF_JMP;
}
+class JMP_IND<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
+ : TYPE_ALU_JMP<Opc.Value, BPF_X.Value,
+ (outs),
+ (ins GPR:$dst),
+ !strconcat(OpcodeStr, " $dst"),
+ Pattern> {
+ bits<4> dst;
+
+ let Inst{51-48} = dst;
+ let BPFClass = BPF_JMP;
+}
+
class JMP_JCOND<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
: TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
(outs),
@@ -281,6 +302,10 @@ defm JSLT : J<BPF_JSLT, "s<", BPF_CC_LT, BPF_CC_LT_32>;
defm JSLE : J<BPF_JSLE, "s<=", BPF_CC_LE, BPF_CC_LE_32>;
defm JSET : J<BPF_JSET, "&", NoCond, NoCond>;
def JCOND : JMP_JCOND<BPF_JCOND, "may_goto", []>;
+
+let isIndirectBranch = 1 in {
+ def JX : JMP_IND<BPF_JA, "gotox", [(brind i64:$dst)]>;
+}
}
// ALU instructions
@@ -851,6 +876,8 @@ let usesCustomInserter = 1, isCodeGenOnly = 1 in {
// load 64-bit global addr into register
def : Pat<(BPFWrapper tglobaladdr:$in), (LD_imm64 tglobaladdr:$in)>;
def : Pat<(BPFWrapper tconstpool:$in), (LD_imm64 tconstpool:$in)>;
+def : Pat<(BPFWrapper tblockaddress:$in), (LD_imm64 tblockaddress:$in)>;
+def : Pat<(BPFWrapper tjumptable:$in), (LD_imm64 tjumptable:$in)>;
// 0xffffFFFF doesn't fit into simm32, optimize common case
def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
diff --git a/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/llvm/lib/Target/BPF/BPFMCInstLower.cpp
index 040a1fb750702..164d172c241c8 100644
--- a/llvm/lib/Target/BPF/BPFMCInstLower.cpp
+++ b/llvm/lib/Target/BPF/BPFMCInstLower.cpp
@@ -77,6 +77,9 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
case MachineOperand::MO_ConstantPoolIndex:
MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
+ break;
}
OutMI.addOperand(MCOp);
>From e554fb702c929b344accffeaa95c5545782a1f2b Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Wed, 2 Apr 2025 10:27:33 -0700
Subject: [PATCH 2/5] Fix an assert error
---
llvm/lib/Target/BPF/BPFInstrInfo.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 70bc163615f61..e61aa62c88f26 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -181,6 +181,10 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (!isUnpredicatedTerminator(*I))
break;
+ // If a JX insn, we're done.
+ if (I->getOpcode() == BPF::JX)
+ break;
+
// A terminator that isn't a branch can't easily be handled
// by this analysis.
if (!I->isBranch())
>From 822ead1c2c8016c54fc16b2220d00ac37bbf8077 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Wed, 2 Apr 2025 11:12:15 -0700
Subject: [PATCH 3/5] Generate .llvm_jump_table_sizes section
For example,
[ 6] .rodata PROGBITS 0000000000000000 000740 0000d6 00 A 0 0 8
[ 7] .rel.rodata REL 0000000000000000 003860 000080 10 I 39 6 8
[ 8] .llvm_jump_table_sizes LLVM_JT_SIZES 0000000000000000 000816 000010 00 0 0 1
[ 9] .rel.llvm_jump_table_sizes REL 0000000000000000 0038e0 000010 10 I 39 8 8
...
[14] .llvm_jump_table_sizes LLVM_JT_SIZES 0000000000000000 000958 000010 00 0 0 1
[15] .rel.llvm_jump_table_sizes REL 0000000000000000 003970 000010 10 I 39 14 8
With llvm-readelf dump section 8 and 14:
$ llvm-readelf -x 8 user_ringbuf_success.bpf.o
Hex dump of section '.llvm_jump_table_sizes':
0x00000000 00000000 00000000 04000000 00000000 ................
$ llvm-readelf -x 14 user_ringbuf_success.bpf.o
Hex dump of section '.llvm_jump_table_sizes':
0x00000000 20000000 00000000 04000000 00000000 ...............
As you can see, there are two jump tables:
jump table 1: offset 0, size 4 (4 labels)
jump table 2: offset 0x20, size 4 (4 labels)
Checking sections 9 and 15, we can find the corresponding relocation sections:
Relocation section '.rel.llvm_jump_table_sizes' at offset 0x38e0 contains 1 entries:
Offset Info Type Symbol's Value Symbol's Name
0000000000000000 0000000a00000002 R_BPF_64_ABS64 0000000000000000 .rodata
Relocation section '.rel.llvm_jump_table_sizes' at offset 0x3970 contains 1 entries:
Offset Info Type Symbol's Value Symbol's Name
0000000000000000 0000000a00000002 R_BPF_64_ABS64 0000000000000000 .rodata
and confirmed that the relocation is against '.rodata'.
Dump .rodata section:
0x00000000 a8000000 00000000 10010000 00000000 ................
0x00000010 b8000000 00000000 c8000000 00000000 ................
0x00000020 28040000 00000000 00050000 00000000 (...............
0x00000030 70040000 00000000 b8040000 00000000 p...............
0x00000040 44726169 6e207265 7475726e 65643a20 Drain returned:
So we can get two jump tables:
.rodata offset 0, # of labels 4:
0x00000000 a8000000 00000000 10010000 00000000 ................
0x00000010 b8000000 00000000 c8000000 00000000 ................
.rodata offset 0x20, # of labels 4:
0x00000020 28040000 00000000 00050000 00000000 (...............
0x00000030 70040000 00000000 b8040000 00000000 p...............
This way, you just need to scan the related code section. As long as a
relocation matches one of the jump tables (a .rodata relocation with a
matching offset), libbpf does not need to care about gotox at all.
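A minimal sketch of how a loader could consume this, assuming each
.llvm_jump_table_sizes entry is a pair of u64 values (jump table offset,
relocated against .rodata, and label count) as in the dumps above. The
struct and function names are made up for illustration; this is not actual
libbpf code.

#include <stdint.h>
#include <stdio.h>

struct jt_size_entry {
        uint64_t offset;   /* jump table offset within .rodata */
        uint64_t count;    /* number of labels in this jump table */
};

/* Each label is an 8-byte byte offset into the code section, so dividing
 * by the insn size (8) gives the target insn index.
 */
static void dump_jump_table(const struct jt_size_entry *e,
                            const uint8_t *rodata)
{
        const uint64_t *labels = (const uint64_t *)(rodata + e->offset);
        uint64_t i;

        printf("jump table at .rodata+0x%llx, %llu labels:\n",
               (unsigned long long)e->offset, (unsigned long long)e->count);
        for (i = 0; i < e->count; i++)
                printf("  label %llu -> insn idx %llu\n",
                       (unsigned long long)i,
                       (unsigned long long)(labels[i] / 8));
}

Applied to the first jump table of the earlier object file (labels 0x2a8,
0x310, 0x2b8, 0x2c8), this would print insn indices 85, 98, 87 and 89,
matching the annotation in the disassembly above.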
---
llvm/include/llvm/CodeGen/AsmPrinter.h | 2 ++
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +-
llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 1 +
3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 9a8f2d5e398e7..69a5547408d1c 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>
#include <memory>
@@ -33,6 +34,7 @@
#include <vector>
namespace llvm {
+extern cl::opt<bool> EmitJumpTableSizesSection;
class AddrLabelMap;
class AsmPrinterHandler;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index fdb81b05d9490..7b237b3851aa0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -168,7 +168,7 @@ static cl::opt<bool> BBAddrMapSkipEmitBBEntries(
"unnecessary for some PGOAnalysisMap features."),
cl::Hidden, cl::init(false));
-static cl::opt<bool> EmitJumpTableSizesSection(
+cl::opt<bool> llvm::EmitJumpTableSizesSection(
"emit-jump-table-sizes-section",
cl::desc("Emit a section containing jump table addresses and sizes"),
cl::Hidden, cl::init(false));
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index 5dd71cc91427a..e2856bab354c8 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -57,6 +57,7 @@ class BPFAsmPrinter : public AsmPrinter {
} // namespace
bool BPFAsmPrinter::doInitialization(Module &M) {
+ EmitJumpTableSizesSection = true;
AsmPrinter::doInitialization(M);
// Only emit BTF when debuginfo available.
>From 2ba950a656f62dba94e705565c29d11a2e7fe6d0 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Wed, 2 Apr 2025 11:12:15 -0700
Subject: [PATCH 4/5] Handle jump table change in machine-sink pass
The implementation is similar to the getJumpTableIndex() function in X86InstrInfo.cpp.
For the following example:
struct simple_ctx {
int x;
int y;
int z;
};
int ret_user, ret_user2;
void bar(void);
int foo(struct simple_ctx *ctx, struct simple_ctx *ctx2)
{
switch (ctx->x) {
case 1:
ret_user = 8;
break;
case 6:
ret_user = 3;
break;
case 2:
ret_user = 4;
break;
case 31:
ret_user = 5;
break;
default:
ret_user = 19;
break;
}
bar();
switch (ctx2->x) {
case 0:
ret_user2 = 8;
break;
case 7:
ret_user2 = 3;
break;
case 9:
ret_user2 = 4;
break;
case 31:
ret_user2 = 5;
break;
default:
ret_user2 = 29;
break;
}
return 0;
}
Before machine-sink pass,
Jump Tables:
%jump-table.0: %bb.5 %bb.2 %bb.4 %bb.4 %bb.4 %bb.1 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.3
%jump-table.1: %bb.10 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.6 %bb.9 %bb.7 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.8
Machine-level IR:
bb.0.entry:
successors: %bb.4(0x0ccccccb), %bb.11(0x73333335); %bb.4(10.00%), %bb.11(90.00%)
liveins: $r1, $r2
%3:gpr = COPY $r2
%2:gpr = COPY $r1
%4:gpr32 = MOV_ri_32 8
%6:gpr32 = LDW32 %2:gpr, 0 :: (load (s32) from %ir.ctx, !tbaa !3)
%7:gpr32 = ADD_ri_32 %6:gpr32(tied-def 0), -1
%5:gpr = MOV_32_64 %7:gpr32
JUGT_ri_32 %7:gpr32, 30, %bb.4
bb.11.entry:
; predecessors: %bb.0
successors: %bb.5(0x1c71c71c), %bb.2(0x1c71c71c), %bb.4(0x0e38e38e), %bb.1(0x1c71c71c), %bb.3(0x1c71c71c); %bb.5(22.22%), %bb.2(22.22%), %bb.4(11.11%), %bb.1(22.22%), %bb.3(22.22%)
%8:gpr = SLL_ri %5:gpr(tied-def 0), 3
%9:gpr = LD_imm64 %jump-table.0
%10:gpr = ADD_rr %9:gpr(tied-def 0), killed %8:gpr
%11:gpr = LDD killed %10:gpr, 0 :: (load (s64) from jump-table)
JX killed %11:gpr
bb.1.sw.bb1:
; predecessors: %bb.11
successors: %bb.5(0x80000000); %bb.5(100.00%)
%14:gpr32 = MOV_ri_32 3
JMP %bb.5
bb.2.sw.bb2:
; predecessors: %bb.11
successors: %bb.5(0x80000000); %bb.5(100.00%)
%13:gpr32 = MOV_ri_32 4
JMP %bb.5
bb.3.sw.bb3:
; predecessors: %bb.11
successors: %bb.5(0x80000000); %bb.5(100.00%)
%12:gpr32 = MOV_ri_32 5
JMP %bb.5
bb.4.sw.default:
; predecessors: %bb.11, %bb.0
successors: %bb.5(0x80000000); %bb.5(100.00%)
%15:gpr32 = MOV_ri_32 19
bb.5.sw.epilog:
...
After machine-sink pass:
Jump Tables:
%jump-table.0: %bb.13 %bb.2 %bb.4 %bb.4 %bb.4 %bb.1 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.4 %bb.3
%jump-table.1: %bb.14 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.6 %bb.9 %bb.7 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.9 %bb.8
Machine-level IR:
bb.0.entry:
successors: %bb.4(0x0ccccccb), %bb.11(0x73333335); %bb.4(10.00%), %bb.11(90.00%)
liveins: $r1, $r2
%3:gpr = COPY $r2
%2:gpr = COPY $r1
%6:gpr32 = LDW32 %2:gpr, 0 :: (load (s32) from %ir.ctx, !tbaa !3)
%7:gpr32 = ADD_ri_32 %6:gpr32(tied-def 0), -1
JUGT_ri_32 %7:gpr32, 30, %bb.4
bb.11.entry:
; predecessors: %bb.0
successors: %bb.13(0x1c71c71c), %bb.2(0x1c71c71c), %bb.4(0x0e38e38e), %bb.1(0x1c71c71c), %bb.3(0x1c71c71c); %bb.13(22.22%), %bb.2(22.22%), %bb.4(11.11%), %bb.1(22.22%), %bb.3(22.22%)
%5:gpr = MOV_32_64 %7:gpr32
%8:gpr = SLL_ri %5:gpr(tied-def 0), 3
%9:gpr = LD_imm64 %jump-table.0
%10:gpr = ADD_rr %9:gpr(tied-def 0), killed %8:gpr
%11:gpr = LDD killed %10:gpr, 0 :: (load (s64) from jump-table)
JX killed %11:gpr
bb.13:
; predecessors: %bb.11
successors: %bb.5(0x80000000); %bb.5(100.00%)
%4:gpr32 = MOV_ri_32 8
JMP %bb.5
bb.1.sw.bb1:
; predecessors: %bb.11
successors: %bb.5(0x80000000); %bb.5(100.00%)
%14:gpr32 = MOV_ri_32 3
JMP %bb.5
bb.2.sw.bb2:
; predecessors: %bb.11
successors: %bb.5(0x80000000); %bb.5(100.00%)
%13:gpr32 = MOV_ri_32 4
JMP %bb.5
bb.3.sw.bb3:
; predecessors: %bb.11
successors: %bb.5(0x80000000); %bb.5(100.00%)
%12:gpr32 = MOV_ri_32 5
JMP %bb.5
bb.4.sw.default:
; predecessors: %bb.11, %bb.0
successors: %bb.5(0x80000000); %bb.5(100.00%)
%15:gpr32 = MOV_ri_32 19
bb.5.sw.epilog:
Before the machine-sink pass, '%4:gpr32 = MOV_ri_32 8' is in the entry block,
so there is no separate switch target block for the case that assigns value 8.
The machine-sink pass later removed '%4:gpr32 = MOV_ri_32 8' from the entry
block and added a new switch target block (bb.13) containing it. Such a
transformation requires adjusting the jump table. This commit implements the
backend callback function getJumpTableIndex() so the jump table can be
properly updated.
---
llvm/lib/Target/BPF/BPFInstrInfo.cpp | 37 ++++++++++++++++++++++++++++
llvm/lib/Target/BPF/BPFInstrInfo.h | 3 +++
2 files changed, 40 insertions(+)
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index e61aa62c88f26..78626c39e80f7 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -263,3 +263,40 @@ unsigned BPFInstrInfo::removeBranch(MachineBasicBlock &MBB,
return Count;
}
+
+int BPFInstrInfo::getJumpTableIndex(const MachineInstr &MI) const {
+ // The pattern looks like:
+ // %0 = LD_imm64 %jump-table.0 ; load jump-table address
+ // %1 = ADD_rr %0, $another_reg ; address + offset
+ // %2 = LDD %1, 0 ; load the actual label
+ // JX %2
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ Register Reg = MI.getOperand(0).getReg();
+ if (!Reg.isVirtual())
+ return -1;
+ MachineInstr *Ldd = MRI.getUniqueVRegDef(Reg);
+ if (Ldd == nullptr || Ldd->getOpcode() != BPF::LDD)
+ return -1;
+
+ Reg = Ldd->getOperand(1).getReg();
+ if (!Reg.isVirtual())
+ return -1;
+ MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
+ if (Add == nullptr || Add->getOpcode() != BPF::ADD_rr)
+ return -1;
+
+ Reg = Add->getOperand(1).getReg();
+ if (!Reg.isVirtual())
+ return -1;
+ MachineInstr *LDimm64 = MRI.getUniqueVRegDef(Reg);
+ if (LDimm64 == nullptr || LDimm64->getOpcode() != BPF::LD_imm64)
+ return -1;
+
+ const MachineOperand &MO = LDimm64->getOperand(1);
+ if (!MO.isJTI())
+ return -1;
+
+ return MO.getIndex();
+}
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h
index d8bbad44e314e..d88e37975980a 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -58,6 +58,9 @@ class BPFInstrInfo : public BPFGenInstrInfo {
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+
+ int getJumpTableIndex(const MachineInstr &MI) const override;
+
private:
void expandMEMCPY(MachineBasicBlock::iterator) const;
>From 2fe93dc077fcde5bdc60a90f6335d29369f617ff Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song at linux.dev>
Date: Sun, 11 May 2025 11:32:46 -0700
Subject: [PATCH 5/5] Ensure gotox can be the start of an asm insn.
Also remove gotol from the set of identifiers allowed in the middle of an asm insn.
---
llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 2e4819e5ede38..034bf14c53f53 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -232,6 +232,7 @@ struct BPFOperand : public MCParsedAsmOperand {
.Case("callx", true)
.Case("goto", true)
.Case("gotol", true)
+ .Case("gotox", true)
.Case("may_goto", true)
.Case("*", true)
.Case("exit", true)
@@ -261,7 +262,6 @@ struct BPFOperand : public MCParsedAsmOperand {
.Case("bswap32", true)
.Case("bswap64", true)
.Case("goto", true)
- .Case("gotol", true)
.Case("ll", true)
.Case("skb", true)
.Case("s", true)