[llvm] [MC][ARM] Fix crash when assembling Thumb 'movs r0, #foo'. (PR #115026)

Fri Nov 8 07:25:50 PST 2024

https://github.com/statham-arm updated https://github.com/llvm/llvm-project/pull/115026

>From 1af3d37aa383c24aa97e030a7e77a0c9275a3167 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Mon, 4 Nov 2024 09:09:14 +0000
Subject: [PATCH 1/2] [MC][ARM] Fix crash when assembling Thumb 'movs r0,#foo'.

If the assembler sees this instruction, understanding 'foo' to be an
external symbol, there's no relocation it can write that will put the
whole value of 'foo' into the 8-bit immediate field of the 16-bit
Thumb add instruction. So it should report an error message pointing
at the source line, and in LLVM 18, it did exactly that. But now the
error is not reported, due to an indexing error in the operand list in
`validateInstruction`, and instead the code continues to attempt
assembly, ending up aborting at the `llvm_unreachable` at the end of
`getHiLoImmOpValue`.

In this commit I've fixed the index in the `ARM::tMOVi8` case of
`validateInstruction`, and also the one for `tADDi8` which must cope
with either the 2- or 3-operand form in the input assembly source. But
also, while writing the test, I found that if you assemble for Armv7-M
instead of Armv6-M, the instruction has opcode `t2ADDri` when it goes
through `validateInstruction`, and only turns into `tMOVi8` later in
`processInstruction`. Then it's too late for `validateInstruction` to
report that error. So I've introduced a new `revalidateInstruction`
which is run _after_ modifying the instruction, and repeated the same
check there.
---
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 74 ++++++++++++++++---
 llvm/test/MC/ARM/lower-upper-errors.s         | 20 +++++
 2 files changed, 85 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/MC/ARM/lower-upper-errors.s

diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 0df1c336a22146..8e3e27dec76ff1 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -659,6 +659,8 @@ class ARMAsmParser : public MCTargetAsmParser {
 
   bool validateInstruction(MCInst &Inst, const OperandVector &Ops,
                            unsigned MnemonicOpsEndInd);
+  bool revalidateInstruction(MCInst &Inst, const OperandVector &Ops,
+                             unsigned MnemonicOpsEndInd, unsigned OrigOpcode);
   bool processInstruction(MCInst &Inst, const OperandVector &Ops,
                           unsigned MnemonicOpsEndInd, MCStreamer &Out);
   bool shouldOmitVectorPredicateOperand(StringRef Mnemonic,
@@ -8294,7 +8296,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tADDi8: {
-    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd + 1];
+    int i = (Operands[MnemonicOpsEndInd + 1]->isImm()) ? MnemonicOpsEndInd + 1
+                                                       : MnemonicOpsEndInd + 2;
+    MCParsedAsmOperand &Op = *Operands[i];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb adds requires :lower0_7:,"
@@ -8302,7 +8306,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tMOVi8: {
-    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd];
+    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd + 1];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb movs requires :lower0_7:,"
@@ -8653,6 +8657,41 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   return false;
 }
 
+// After processInstruction has transformed an instruction being assembled into
+// a different opcode, do any further validity checks that the new opcode
+// depends on.
+//
+// `Inst` contains the final modified form of the instruction, but `Operands`
+// contains the parsed operands from the _original_ instruction, because
+// nothing has updated them (`processInstruction` received them as const).
+// `OrigOpcode` contains the original value of `Inst.getOpcode()`, which should
+// give enough context to know how to understand the original operands list.
+bool ARMAsmParser::revalidateInstruction(MCInst &Inst,
+                                         const OperandVector &Operands,
+                                         unsigned MnemonicOpsEndInd,
+                                         unsigned OrigOpcode) {
+  const unsigned NewOpcode = Inst.getOpcode();
+
+  if (OrigOpcode == ARM::t2ADDri && NewOpcode == ARM::tADDi8) {
+    // t2ADDri is the Thumb 32-bit immediate add instruction, for example
+    // 'add[s] r0,r1,#0xff00'. If its immediate argument isn't a constant
+    // requiring shifting, then processInstruction can turn it into tADDi8, the
+    // simpler 16-bit Thumb immediate add (provided all the other conditions
+    // for that transformation are met). That makes it too late for
+    // validateInstruction to do this check, which it would have done if it had
+    // known from the start that the instruction was tADDi8.
+    int i = (Operands[MnemonicOpsEndInd + 1]->isImm()) ? MnemonicOpsEndInd + 1
+                                                       : MnemonicOpsEndInd + 2;
+    MCParsedAsmOperand &Op = *Operands[i];
+    if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
+      return Error(Op.getStartLoc(),
+                   "Immediate expression for Thumb adds requires :lower0_7:,"
+                   " :lower8_15:, :upper0_7: or :upper8_15:");
+  }
+
+  return false;
+}
+
 static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) {
   switch(Opc) {
   default: llvm_unreachable("unexpected opcode!");
@@ -11399,7 +11438,7 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
 
   switch (MatchResult) {
-  case Match_Success:
+  case Match_Success: {
     LLVM_DEBUG(dbgs() << "Parsed as: ";
                Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                dbgs() << "\n");
@@ -11414,15 +11453,31 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       return true;
     }
 
-    {
-      // Some instructions need post-processing to, for example, tweak which
-      // encoding is selected. Loop on it while changes happen so the
-      // individual transformations can chain off each other. E.g.,
-      // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-      while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out))
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the individual
+    // transformations can chain off each other. E.g.,
+    // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+    //
+    // This is written as a do-while inside an if, instead of the more obvious
+    // while loop, so that after postprocessing has completed the revised
+    // instruction can be revalidated, but (to save time) only if any changes
+    // had to be made at all.
+    unsigned OrigOpcode = Inst.getOpcode();
+    if (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out)) {
+      do {
         LLVM_DEBUG(dbgs() << "Changed to: ";
                    Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                    dbgs() << "\n");
+      } while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out));
+
+      MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+      if (revalidateInstruction(Inst, Operands, MnemonicOpsEndInd,
+                                OrigOpcode)) {
+        // As above, advance IT/VPT positions if we're exiting early.
+        forwardITPosition();
+        forwardVPTPosition();
+        return true;
+      }
     }
 
     // Only move forward at the very end so that everything in validate
@@ -11445,6 +11500,7 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       Out.emitInstruction(Inst, getSTI());
     }
     return false;
+  }
   case Match_NearMisses:
     ReportNearMisses(NearMisses, IDLoc, Operands);
     return true;
diff --git a/llvm/test/MC/ARM/lower-upper-errors.s b/llvm/test/MC/ARM/lower-upper-errors.s
new file mode 100644
index 00000000000000..34fa3c7155ba1e
--- /dev/null
+++ b/llvm/test/MC/ARM/lower-upper-errors.s
@@ -0,0 +1,20 @@
+// RUN: not llvm-mc --triple thumbv6m -show-encoding %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc --triple thumbv7m -show-encoding %s 2>&1 | FileCheck %s --check-prefixes=CHECK,THUMB2
+
+// Check reporting of errors of the form "you should have used
+// :lower16: in this immediate field".
+
+// CHECK: :[[@LINE+1]]:10: error: Immediate expression for Thumb movs requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+movs r0, #foo
+
+// CHECK: :[[@LINE+1]]:10: error: Immediate expression for Thumb adds requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+adds r0, #foo
+
+// CHECK: :[[@LINE+1]]:14: error: Immediate expression for Thumb adds requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+adds r0, r0, #foo
+
+// THUMB2: :[[@LINE+1]]:10: error: immediate expression for mov requires :lower16: or :upper16
+movw r0, #foo
+
+// THUMB2: :[[@LINE+1]]:10: error: immediate expression for mov requires :lower16: or :upper16
+movt r0, #foo

>From bd0cf26632efee6505ade6b9bffdbbeb439e1e5f Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Fri, 8 Nov 2024 13:34:45 +0000
Subject: [PATCH 2/2] Change strategy to avoid converting to narrow add
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I've reverted the part of the previous commit that introduced and used
`revalidateInstruction`, replacing it with a more careful check in
`processInstruction` of whether to transform `t2ADDri` → `tADDi8` in
the first place.

I've kept the index fixes in the existing `validateInstruction`.

Also expanded the testing considerably, including making two test
files to deal with the fact that some of these unemittable
instructions aren't detected until a later phase of assembly, which
won't be reached if there are any errors in the first phase.
---
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 95 +++++--------------
 llvm/test/MC/ARM/lower-upper-errors-2.s       | 18 ++++
 llvm/test/MC/ARM/lower-upper-errors.s         | 45 +++++++--
 3 files changed, 82 insertions(+), 76 deletions(-)
 create mode 100644 llvm/test/MC/ARM/lower-upper-errors-2.s

diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 8e3e27dec76ff1..448a6a92e70ce9 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -659,8 +659,6 @@ class ARMAsmParser : public MCTargetAsmParser {
 
   bool validateInstruction(MCInst &Inst, const OperandVector &Ops,
                            unsigned MnemonicOpsEndInd);
-  bool revalidateInstruction(MCInst &Inst, const OperandVector &Ops,
-                             unsigned MnemonicOpsEndInd, unsigned OrigOpcode);
   bool processInstruction(MCInst &Inst, const OperandVector &Ops,
                           unsigned MnemonicOpsEndInd, MCStreamer &Out);
   bool shouldOmitVectorPredicateOperand(StringRef Mnemonic,
@@ -8657,41 +8655,6 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   return false;
 }
 
-// After processInstruction has transformed an instruction being assembled into
-// a different opcode, do any further validity checks that the new opcode
-// depends on.
-//
-// `Inst` contains the final modified form of the instruction, but `Operands`
-// contains the parsed operands from the _original_ instruction, because
-// nothing has updated them (`processInstruction` received them as const).
-// `OrigOpcode` contains the original value of `Inst.getOpcode()`, which should
-// give enough context to know how to understand the original operands list.
-bool ARMAsmParser::revalidateInstruction(MCInst &Inst,
-                                         const OperandVector &Operands,
-                                         unsigned MnemonicOpsEndInd,
-                                         unsigned OrigOpcode) {
-  const unsigned NewOpcode = Inst.getOpcode();
-
-  if (OrigOpcode == ARM::t2ADDri && NewOpcode == ARM::tADDi8) {
-    // t2ADDri is the Thumb 32-bit immediate add instruction, for example
-    // 'add[s] r0,r1,#0xff00'. If its immediate argument isn't a constant
-    // requiring shifting, then processInstruction can turn it into tADDi8, the
-    // simpler 16-bit Thumb immediate add (provided all the other conditions
-    // for that transformation are met). That makes it too late for
-    // validateInstruction to do this check, which it would have done if it had
-    // known from the start that the instruction was tADDi8.
-    int i = (Operands[MnemonicOpsEndInd + 1]->isImm()) ? MnemonicOpsEndInd + 1
-                                                       : MnemonicOpsEndInd + 2;
-    MCParsedAsmOperand &Op = *Operands[i];
-    if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
-      return Error(Op.getStartLoc(),
-                   "Immediate expression for Thumb adds requires :lower0_7:,"
-                   " :lower8_15:, :upper0_7: or :upper8_15:");
-  }
-
-  return false;
-}
-
 static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) {
   switch(Opc) {
   default: llvm_unreachable("unexpected opcode!");
@@ -10748,14 +10711,25 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     // the flags are compatible with the current IT status, use encoding T2
     // instead of T3. For compatibility with the system 'as'. Make sure the
     // wide encoding wasn't explicit.
-    if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
-        !isARMLowRegister(Inst.getOperand(0).getReg()) ||
-        (Inst.getOperand(2).isImm() &&
-         (unsigned)Inst.getOperand(2).getImm() > 255) ||
-        Inst.getOperand(5).getReg() !=
-            (inITBlock() ? ARM::NoRegister : ARM::CPSR) ||
-        HasWideQualifier)
-      break;
+    if (HasWideQualifier)
+      break; // source code has asked for the 32-bit instruction
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg())
+      break; // tADDi8 can't take different input and output registers
+    if (!isARMLowRegister(Inst.getOperand(0).getReg()))
+      break; // high register that tADDi8 can't access
+    if (Inst.getOperand(5).getReg() !=
+        (inITBlock() ? ARM::NoRegister : ARM::CPSR))
+      break; // flag-modification would require overriding the IT state
+    if (Inst.getOperand(2).isImm()) {
+      if ((unsigned)Inst.getOperand(2).getImm() > 255)
+        break; // large immediate that tADDi8 can't contain
+    } else {
+      int i = (Operands[MnemonicOpsEndInd + 1]->isImm()) ? MnemonicOpsEndInd + 1
+        : MnemonicOpsEndInd + 2;
+      MCParsedAsmOperand &Op = *Operands[i];
+      if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
+        break; // a type of non-immediate that tADDi8 can't represent
+    }
     MCInst TmpInst;
     TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
                       ARM::tADDi8 : ARM::tSUBi8);
@@ -11438,7 +11412,7 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
 
   switch (MatchResult) {
-  case Match_Success: {
+  case Match_Success:
     LLVM_DEBUG(dbgs() << "Parsed as: ";
                Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                dbgs() << "\n");
@@ -11453,31 +11427,15 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       return true;
     }
 
-    // Some instructions need post-processing to, for example, tweak which
-    // encoding is selected. Loop on it while changes happen so the individual
-    // transformations can chain off each other. E.g.,
-    // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-    //
-    // This is written as a do-while inside an if, instead of the more obvious
-    // while loop, so that after postprocessing has completed the revised
-    // instruction can be revalidated, but (to save time) only if any changes
-    // had to be made at all.
-    unsigned OrigOpcode = Inst.getOpcode();
-    if (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out)) {
-      do {
+    {
+      // Some instructions need post-processing to, for example, tweak which
+      // encoding is selected. Loop on it while changes happen so the
+      // individual transformations can chain off each other. E.g.,
+      // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+      while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out))
         LLVM_DEBUG(dbgs() << "Changed to: ";
                    Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                    dbgs() << "\n");
-      } while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out));
-
-      MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
-      if (revalidateInstruction(Inst, Operands, MnemonicOpsEndInd,
-                                OrigOpcode)) {
-        // As above, advance IT/VPT positions if we're exiting early.
-        forwardITPosition();
-        forwardVPTPosition();
-        return true;
-      }
     }
 
     // Only move forward at the very end so that everything in validate
@@ -11500,7 +11458,6 @@ bool ARMAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       Out.emitInstruction(Inst, getSTI());
     }
     return false;
-  }
   case Match_NearMisses:
     ReportNearMisses(NearMisses, IDLoc, Operands);
     return true;
diff --git a/llvm/test/MC/ARM/lower-upper-errors-2.s b/llvm/test/MC/ARM/lower-upper-errors-2.s
new file mode 100644
index 00000000000000..58c1f117923001
--- /dev/null
+++ b/llvm/test/MC/ARM/lower-upper-errors-2.s
@@ -0,0 +1,18 @@
+// RUN: not llvm-mc --triple thumbv7m -filetype=obj -o /dev/null %s 2>&1 | FileCheck %s
+
+// This test checks reporting of errors of the form "you should have
+// used :lower16: in this immediate field", when the errors are
+// discovered at the object-file output stage by checking the set of
+// available relocations.
+//
+// For errors that are reported earlier, when initially reading the
+// instructions, see lower-upper-errors.s.
+
+// CHECK: [[@LINE+1]]:1: error: unsupported relocation
+adds r0, r0, #foo
+
+// CHECK: [[@LINE+1]]:1: error: unsupported relocation
+add r9, r0, #foo
+
+// CHECK: [[@LINE+1]]:1: error: expected relocatable expression
+movs r11, :upper8_15:#foo
diff --git a/llvm/test/MC/ARM/lower-upper-errors.s b/llvm/test/MC/ARM/lower-upper-errors.s
index 34fa3c7155ba1e..d97432b607cef3 100644
--- a/llvm/test/MC/ARM/lower-upper-errors.s
+++ b/llvm/test/MC/ARM/lower-upper-errors.s
@@ -1,8 +1,12 @@
-// RUN: not llvm-mc --triple thumbv6m -show-encoding %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc --triple thumbv7m -show-encoding %s 2>&1 | FileCheck %s --check-prefixes=CHECK,THUMB2
+// RUN: not llvm-mc --triple thumbv6m %s 2>&1 | FileCheck %s --check-prefixes=CHECK,THUMB1
+// RUN: not llvm-mc --triple thumbv7m %s 2>&1 | FileCheck %s --check-prefixes=CHECK,THUMB2
 
-// Check reporting of errors of the form "you should have used
-// :lower16: in this immediate field".
+// This test checks reporting of errors of the form "you should have
+// used :lower16: in this immediate field", during initial reading of
+// the source file.
+//
+// For errors that are reported at object-file output time, see
+// lower-upper-errors-2.s.
 
 // CHECK: :[[@LINE+1]]:10: error: Immediate expression for Thumb movs requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
 movs r0, #foo
@@ -10,11 +14,38 @@ movs r0, #foo
 // CHECK: :[[@LINE+1]]:10: error: Immediate expression for Thumb adds requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
 adds r0, #foo
 
-// CHECK: :[[@LINE+1]]:14: error: Immediate expression for Thumb adds requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
-adds r0, r0, #foo
-
 // THUMB2: :[[@LINE+1]]:10: error: immediate expression for mov requires :lower16: or :upper16
 movw r0, #foo
 
 // THUMB2: :[[@LINE+1]]:10: error: immediate expression for mov requires :lower16: or :upper16
 movt r0, #foo
+
+// With a Thumb2 wide add instruction available, this case isn't an error
+// while reading the source file. It only causes a problem when an object
+// file is output, and it turns out there's no suitable relocation to ask
+// for the value of an external symbol to be turned into a Thumb shifted
+// immediate field. And in this case the other errors in this source file
+// cause assembly to terminate before reaching the object-file output stage
+// (even if a test run had had -filetype=obj).
+
+// THUMB1: :[[@LINE+2]]:14: error: Immediate expression for Thumb adds requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+// THUMB2-NOT: :[[@LINE+1]]:{{[0-9]+}}: error:
+adds r0, r0, #foo
+
+// Similarly for this version, which _must_ be the wide encoding due
+// to the use of a high register and the lack of flag-setting.
+
+// THUMB1: :[[@LINE+2]]:1: error: invalid instruction
+// THUMB2-NOT: :[[@LINE+1]]:{{[0-9]+}}: error:
+add r9, r0, #foo
+
+// CHECK: :[[@LINE+1]]:10: error: Immediate expression for Thumb movs requires :lower0_7:, :lower8_15:, :upper0_7: or :upper8_15:
+movs r0, :lower16:#foo
+
+// This is invalid in either architecture: in Thumb1 it can't use a
+// high register, and in Thumb2 it can't use :upper8_15:. But the
+// Thumb2 case won't cause an error until output.
+
+// THUMB1: :[[@LINE+2]]:1: error: invalid instruction
+// THUMB2-NOT: :[[@LINE+1]]:{{[0-9]+}}: error:
+movs r11, :upper8_15:#foo