[llvm] [MC][ARM] Fix crash when assembling Thumb 'movs r0, #foo'. (PR #115026)

Simon Tatham via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 5 09:11:42 PST 2024


https://github.com/statham-arm created https://github.com/llvm/llvm-project/pull/115026

If the assembler sees this instruction, understanding `foo` to be an external symbol, there's no relocation it can write that will put the whole value of `foo` into the 8-bit immediate field of the 16-bit Thumb add instruction. So it should report an error message pointing at the source line, and in LLVM 18, it did exactly that. But now the error is not reported, due to an indexing error in the operand list in `validateInstruction`, and instead the code continues to attempt assembly, ending up aborting at the `llvm_unreachable` at the end of `getHiLoImmOpValue`.

In this commit I've fixed the index in the `ARM::tMOVi8` case of `validateInstruction`, and also the one for `tADDi8` which must cope with either the 2- or 3-operand form in the input assembly source. But also, while writing the test, I found that if you assemble for Armv7-M instead of Armv6-M, the instruction has opcode `t2ADDri` when it goes through `validateInstruction`, and only turns into `tMOVi8` later in `processInstruction`. Then it's too late for `validateInstruction` to report that error. So I've introduced a new `revalidateInstruction` which is run _after_ modifying the instruction, and repeated the same check there.

>From 1af3d37aa383c24aa97e030a7e77a0c9275a3167 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Mon, 4 Nov 2024 09:09:14 +0000
Subject: [PATCH] [MC][ARM] Fix crash when assembling Thumb 'movs r0,#foo'.

If the assembler sees this instruction, understanding 'foo' to be an
external symbol, there's no relocation it can write that will put the
whole value of 'foo' into the 8-bit immediate field of the 16-bit
Thumb add instruction. So it should report an error message pointing
at the source line, and in LLVM 18, it did exactly that. But now the
error is not reported, due to an indexing error in the operand list in
`validateInstruction`, and instead the code continues to attempt
assembly, ending up aborting at the `llvm_unreachable` at the end of
`getHiLoImmOpValue`.

In this commit I've fixed the index in the `ARM::tMOVi8` case of
`validateInstruction`, and also the one for `tADDi8` which must cope
with either the 2- or 3-operand form in the input assembly source. But
also, while writing the test, I found that if you assemble for Armv7-M
instead of Armv6-M, the instruction has opcode `t2ADDri` when it goes
through `validateInstruction`, and only turns into `tMOVi8` later in
`processInstruction`. Then it's too late for `validateInstruction` to
report that error. So I've introduced a new `revalidateInstruction`
which is run _after_ modifying the instruction, and repeated the same
check there.
---
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 74 ++++++++++++++++---
 llvm/test/MC/ARM/lower-upper-errors.s         | 20 +++++
 2 files changed, 85 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/MC/ARM/lower-upper-errors.s

diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 0df1c336a22146..8e3e27dec76ff1 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -659,6 +659,8 @@ class ARMAsmParser : public MCTargetAsmParser {
 
   bool validateInstruction(MCInst &Inst, const OperandVector &Ops,
                            unsigned MnemonicOpsEndInd);
+  bool revalidateInstruction(MCInst &Inst, const OperandVector &Ops,
+                             unsigned MnemonicOpsEndInd, unsigned OrigOpcode);
   bool processInstruction(MCInst &Inst, const OperandVector &Ops,
                           unsigned MnemonicOpsEndInd, MCStreamer &Out);
   bool shouldOmitVectorPredicateOperand(StringRef Mnemonic,
@@ -8294,7 +8296,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tADDi8: {
-    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd + 1];
+    int i = (Operands[MnemonicOpsEndInd + 1]->isImm()) ? MnemonicOpsEndInd + 1
+                                                       : MnemonicOpsEndInd + 2;
+    MCParsedAsmOperand &Op = *Operands[i];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb adds requires :lower0_7:,"
@@ -8302,7 +8306,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tMOVi8: {
-    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd];
+    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd + 1];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb movs requires :lower0_7:,"
@@ -8653,6 +8657,41 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   return false;
 }
 
+// After processInstruction has transformed an instruction being assembled into
+// a different opcode, do any further validity checks that the new opcode
+// depends on.
+//
+// `Inst` contains the final modified form of the instruction, but `Operands`
+// contains the parsed operands from the _original_ instruction, because
+// nothing has updated them (`processInstruction` received them as const).
+// `OrigOpcode` contains the original value of `Inst.getOpcode()`, which should
+// give enough context to know how to understand the original operands list.
+bool ARMAsmParser::revalidateInstruction(MCInst &Inst,
+                                         const OperandVector &Operands,
+                                         unsigned MnemonicOpsEndInd,
+                                         unsigned OrigOpcode) {
+  const unsigned NewOpcode = Inst.getOpcode();
+
+  if (OrigOpcode == ARM::t2ADDri && NewOpcode == ARM::tADDi8) {
+    // t2ADDri is the Thumb 32-bit immediate add instruction, for example
+    // 'add[s] r0,r1,#0xff00'. If its immediate argument isn't a constant
+    // requiring shifting, then processInstruction can turn it into tADDi8, the
+    // simpler 16-bit Thumb immediate add (provided all the other conditions
+    // for that transformation are met). That makes it too late for
+    // validateInstruction to do this check, which it would have done if it had
+    // known from the start that the instruction was tADDi8.
+    int i = (Operands[MnemonicOpsEndInd + 1]->isImm()) ? MnemonicOpsEndInd + 1
+                                                       : MnemonicOpsEndInd + 2;
+    MCParsedAsmOperand &Op = *Operands[i];
+    if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
+      return Error(Op.getStartLoc(),
+                   "Immediate expression for Thumb adds requires :lower0_7:,"
+                   " :lower8_15:, :upper0_7: or :upper8_15:");
+  }
+
+  return false;
+}
+
 static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) {
   switch(Opc) {
   default: llvm_unreachable("unexpected opcode!");
@@ -11399,7 +11438,7 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
 
   switch (MatchResult) {
-  case Match_Success:
+  case Match_Success: {
     LLVM_DEBUG(dbgs() << "Parsed as: ";
                Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                dbgs() << "\n");
@@ -11414,15 +11453,31 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       return true;
     }
 
-    {
-      // Some instructions need post-processing to, for example, tweak which
-      // encoding is selected. Loop on it while changes happen so the
-      // individual transformations can chain off each other. E.g.,
-      // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-      while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out))
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the individual
+    // transformations can chain off each other. E.g.,
+    // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+    //
+    // This is written as a do-while inside an if, instead of the more obvious
+    // while loop, so that after postprocessing has completed the revised
+    // instruction can be revalidated, but (to save time) only if any changes
+    // had to be made at all.
+    unsigned OrigOpcode = Inst.getOpcode();
+    if (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out)) {
+      do {
         LLVM_DEBUG(dbgs() << "Changed to: ";
                    Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                    dbgs() << "\n");
+      } while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out));
+
+      MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+      if (revalidateInstruction(Inst, Operands, MnemonicOpsEndInd,
+                                OrigOpcode)) {
+        // As above, advance IT/VPT positions if we're exiting early.
+        forwardITPosition();
+        forwardVPTPosition();
+        return true;
+      }
     }
 
     // Only move forward at the very end so that everything in validate
@@ -11445,6 +11500,7 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       Out.emitInstruction(Inst, getSTI());
     }
     return false;
+  }
   case Match_NearMisses:
     ReportNearMisses(NearMisses, IDLoc, Operands);
     return true;
diff --git a/llvm/test/MC/ARM/lower-upper-errors.s b/llvm/test/MC/ARM/lower-upper-errors.s
new file mode 100644
index 00000000000000..34fa3c7155ba1e
--- /dev/null
+++ b/llvm/test/MC/ARM/lower-upper-errors.s
@@ -0,0 +1,20 @@
+// RUN: not llvm-mc --triple thumbv6m -show-encoding %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc --triple thumbv7m -show-encoding %s 2>&1 | FileCheck %s --check-prefixes=CHECK,THUMB2
+
+// Check reporting of errors of the form "you should have used
+// :lower16: in this immediate field".
+
+// CHECK: :[[@LINE+1]]:10: error: Immediate expression for Thumb movs requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+movs r0, #foo
+
+// CHECK: :[[@LINE+1]]:10: error: Immediate expression for Thumb adds requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+adds r0, #foo
+
+// CHECK: :[[@LINE+1]]:14: error: Immediate expression for Thumb adds requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+adds r0, r0, #foo
+
+// THUMB2: :[[@LINE+1]]:10: error: immediate expression for mov requires :lower16: or :upper16
+movw r0, #foo
+
+// THUMB2: :[[@LINE+1]]:10: error: immediate expression for mov requires :lower16: or :upper16
+movt r0, #foo



More information about the llvm-commits mailing list