[llvm] [RISCV][WIP] Branch to Absolute Address (PR #133555)

Sam Elliott via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 28 19:42:31 PDT 2025


https://github.com/lenary created https://github.com/llvm/llvm-project/pull/133555

Today, if you feed e.g. `beq a0, a1, 60` into LLVM's assembler, you get back something like this: (i.e., the `60` is directly used in the encoding).

```
00000000 <.text>:
       0: <encoding> beq a0, a1, 60 <.text+60>
```

If you feed the same into binutils, you get back something like this (i.e., the `60` is treated like an absolute address):
```
00000000 <.text>:
       0: <encoding> bne a0, a1, 8 <.text+0x8>
       4: <encoding> j 4 <.text+0x4>
             R_RISCV_JAL *ABS*+0x3c
```

[Craig reminded me of this recently](https://github.com/llvm/llvm-project/pull/131996#discussion_r2004045898) and Ana mentioned that there had been previous discussions about this, but did not recall the resolution.

---

There are drawbacks to how binutils works. It cannot really ever say a specific immediate is "wrong", because only the linker resolves these, whereas in LLVM we can catch some issues like that a bit earlier, but obviously we're catching issues in something subtly different.

---

This PR is a pile of hacks to make LLVM's assembler work like binutils. None of this code is particularly nice, but I hope some of it might be a good place to start discussing whether we want to make this change? (I didn't find a previous discussion of this in the issue tracker, but maybe I didn't look hard enough)

There are almost certainly nicer ways to do this, but this is the way I managed today. There are almost certainly some bugs here too, especially in the printing code.



>From 0f9ff50c5a54743256de21c0eed4145a7f886deb Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott at quicinc.com>
Date: Fri, 28 Mar 2025 19:22:19 -0700
Subject: [PATCH] [RISCV][WIP] Branch to Absolute Address

These are some prospective hacks hacks to make `beq r1, r2, <addr>` work
as it does in binutils - that is, `<addr>` is treated as absolute,
rather than relative to the current instruction.

None of this code is particularly nice, but I hope some of it might be a
good place to start discussing whether we want to make this change?
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 66 ++++++++++++++++---
 .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp   | 32 ++++++---
 llvm/lib/Target/RISCV/RISCVInstrInfo.td       | 25 ++++++-
 3 files changed, 105 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 31fe02a88e146..ab0305a21d62c 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -196,6 +196,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
   ParseStatus parseCSRSystemRegister(OperandVector &Operands);
   ParseStatus parseFPImm(OperandVector &Operands);
   ParseStatus parseImmediate(OperandVector &Operands);
+  ParseStatus parseBranchOffsetImmediate(OperandVector &Operands);
   ParseStatus parseRegister(OperandVector &Operands, bool AllowParens = false);
   ParseStatus parseMemOpBaseReg(OperandVector &Operands);
   ParseStatus parseZeroOffsetMemOp(OperandVector &Operands);
@@ -556,6 +557,10 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     return IsValid && VK == RISCVMCExpr::VK_None;
   }
 
+  bool isBranchOffset() const {
+    return isImm();
+  }
+
   // Predicate methods for AsmOperands defined in RISCVInstrInfo.td
 
   bool isBareSymbol() const {
@@ -2009,6 +2014,49 @@ ParseStatus RISCVAsmParser::parseImmediate(OperandVector &Operands) {
   return ParseStatus::Success;
 }
 
+ParseStatus RISCVAsmParser::parseBranchOffsetImmediate(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  SMLoc E;
+  const MCExpr *Res;
+
+  switch (getLexer().getKind()) {
+  default:
+    return ParseStatus::NoMatch;
+  case AsmToken::LParen:
+  case AsmToken::Dot:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+  case AsmToken::Exclaim:
+  case AsmToken::Tilde:
+  case AsmToken::Integer:
+  case AsmToken::String:
+  case AsmToken::Identifier: {
+    if (getParser().parseExpression(Res, E))
+      return ParseStatus::Failure;
+
+    // If we're already a symbol-based expression, just return.
+    // This ends up covering expressions like `. + <offset>`
+    if (Res->getKind() != MCExpr::Constant)
+      break;
+
+    // HAX: Create an absolute symbol with value zero, and add the constant.
+    const MCExpr *Zero = MCConstantExpr::create(0, getContext());
+    MCSymbol *AbsSym = getContext().createTempSymbol();
+    // AbsSym->setVariableValue(Zero);
+    getStreamer().emitAssignment(AbsSym, Zero);
+
+    Res = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(AbsSym, getContext()), Res, getContext());
+    break;
+  }
+  case AsmToken::Percent: {
+    return parseOperandWithSpecifier(Operands);
+  }
+  }
+
+  Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
+  return ParseStatus::Success;
+}
+
 ParseStatus RISCVAsmParser::parseOperandWithSpecifier(OperandVector &Operands) {
   SMLoc S = getLoc();
   SMLoc E;
@@ -2132,20 +2180,20 @@ ParseStatus RISCVAsmParser::parsePseudoJumpSymbol(OperandVector &Operands) {
 }
 
 ParseStatus RISCVAsmParser::parseJALOffset(OperandVector &Operands) {
-  // Parsing jal operands is fiddly due to the `jal foo` and `jal ra, foo`
-  // both being acceptable forms. When parsing `jal ra, foo` this function
-  // will be called for the `ra` register operand in an attempt to match the
-  // single-operand alias. parseJALOffset must fail for this case. It would
-  // seem logical to try parse the operand using parseImmediate and return
+  // Parsing jal operands is fiddly due to the `jal foo` and `jal ra, foo` both
+  // being acceptable forms. When parsing `jal ra, foo` this function will be
+  // called for the `ra` register operand in an attempt to match the
+  // single-operand alias. parseJALOffset must fail for this case. It would seem
+  // logical to try parse the operand using parseBranchImmediate and return
   // NoMatch if the next token is a comma (meaning we must be parsing a jal in
-  // the second form rather than the first). We can't do this as there's no
-  // way of rewinding the lexer state. Instead, return NoMatch if this operand
-  // is an identifier and is followed by a comma.
+  // the second form rather than the first). We can't do this as there's no way
+  // of rewinding the lexer state. Instead, return NoMatch if this operand is an
+  // identifier and is followed by a comma.
   if (getLexer().is(AsmToken::Identifier) &&
       getLexer().peekTok().is(AsmToken::Comma))
     return ParseStatus::NoMatch;
 
-  return parseImmediate(Operands);
+  return parseBranchOffsetImmediate(Operands);
 }
 
 bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index cd2322cc5b26d..1dab205a43144 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
@@ -102,17 +103,32 @@ void RISCVInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address,
                                           const MCSubtargetInfo &STI,
                                           raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNo);
+
+  if (MO.isExpr()) {
+    // Don't mind me, just need to rifle through some of these expressions to
+    // find out if it is absolute symbol reference to an opaque zero
+    if (const auto *BE = dyn_cast<MCBinaryExpr>(MO.getExpr())) {
+      if (const auto *SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS())) {
+        if (const auto *SymVal = dyn_cast<MCConstantExpr>(SRE->getSymbol().getVariableValue(/*false*/))) {
+          if (BE->getOpcode() == MCBinaryExpr::Add && SymVal->getValue() == 0) {
+            BE->getRHS()->print(O, &MAI);
+            return;
+          }
+        }
+      }
+    }
+
+    MO.getExpr()->print(O, &MAI);
+  }
+
+
   if (!MO.isImm())
     return printOperand(MI, OpNo, STI, O);
 
-  if (PrintBranchImmAsAddress) {
-    uint64_t Target = Address + MO.getImm();
-    if (!STI.hasFeature(RISCV::Feature64Bit))
-      Target &= 0xffffffff;
-    markup(O, Markup::Target) << formatHex(Target);
-  } else {
-    markup(O, Markup::Target) << formatImm(MO.getImm());
-  }
+  uint64_t Target = Address + MO.getImm();
+  if (!STI.hasFeature(RISCV::Feature64Bit))
+    Target &= 0xffffffff;
+  markup(O, Markup::Target) << formatHex(Target);
 }
 
 void RISCVInstPrinter::printCSRSystemRegister(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index c9386f2307175..615c3e806f12e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -275,6 +275,29 @@ def simm12 : RISCVSImmLeafOp<12> {
 def simm12_no6 : ImmLeaf<XLenVT, [{
   return isInt<12>(Imm) && !isInt<6>(Imm) && isInt<12>(-Imm);}]>;
 
+
+def BranchOffsetAsmOperand : AsmOperandClass {
+  let Name = "BranchOffset";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "parseBranchOffsetImmediate";
+  let DiagnosticType = !strconcat("Invalid", Name);
+  let DiagnosticString = "operand must be an absolute address or an expression";
+}
+
+def branch_offset : Operand<OtherVT> {
+  let ParserMatchClass = BranchOffsetAsmOperand;
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getImmOpValueAsr1";
+  let DecoderMethod = "decodeSImmOperandAndLsl1<13>";
+  let MCOperandPredicate = [{
+    int64_t Imm;
+    if (MCOp.evaluateAsConstantImm(Imm))
+      return isShiftedInt<12, 1>(Imm);
+    return MCOp.isBareSymbolRef();
+  }];
+  let OperandType = "OPERAND_PCREL";
+}
+
 // A 13-bit signed immediate where the least significant bit is zero.
 def bare_simm13_lsb0 : Operand<OtherVT> {
   let ParserMatchClass = BareSImmNLsb0AsmOperand<13>;
@@ -532,7 +555,7 @@ include "RISCVInstrFormatsV.td"
 
 class BranchCC_rri<bits<3> funct3, string opcodestr>
     : RVInstB<funct3, OPC_BRANCH, (outs),
-              (ins GPR:$rs1, GPR:$rs2, bare_simm13_lsb0:$imm12),
+              (ins GPR:$rs1, GPR:$rs2, branch_offset:$imm12),
               opcodestr, "$rs1, $rs2, $imm12">,
       Sched<[WriteJmp, ReadJmp, ReadJmp]> {
   let isBranch = 1;



More information about the llvm-commits mailing list