[PATCH] MIR Serialization: Start serializing machine operands by serializing physical register operands.

Thu Jun 18 17:02:37 PDT 2015

> On 2015-Jun-18, at 15:30, Alex Lorenz <arphaman at gmail.com> wrote:
> 
> Update this patch to use the new lexing approach that was introduces in the updated patch for http://reviews.llvm.org/D10521.
> 
> 
> REPOSITORY
>  rL LLVM
> 
> http://reviews.llvm.org/D10525
> 
> Files:
>  lib/CodeGen/MIRParser/MILexer.cpp
>  lib/CodeGen/MIRParser/MILexer.h
>  lib/CodeGen/MIRParser/MIParser.cpp
>  lib/CodeGen/MIRPrinter.cpp
>  test/CodeGen/MIR/X86/expected-machine-operand.mir
>  test/CodeGen/MIR/X86/missing-comma.mir
>  test/CodeGen/MIR/X86/named-registers.mir
>  test/CodeGen/MIR/X86/unknown-register.mir
> 
> EMAIL PREFERENCES
>  http://reviews.llvm.org/settings/panel/emailpreferences/
> <D10525.27967.patch>

I have a few comments below.  Also, I was hoping Ahmed (or someone else
with MIR-level familiarity) could comment on the case-insensitive
register names.  Are we safer here than with instruction opcodes?

> Index: lib/CodeGen/MIRParser/MILexer.cpp
> ===================================================================
> --- lib/CodeGen/MIRParser/MILexer.cpp
> +++ lib/CodeGen/MIRParser/MILexer.cpp
> @@ -67,6 +67,33 @@
>    return C;
>  }
>  
> +static Cursor lexPercent(Cursor C, MIToken &Token) {
> +  auto Range = C;
> +  C.advance(); // Skip '%'
> +  while (isIdentifierChar(C.peek()))
> +    C.advance();
> +  Token = MIToken(MIToken::NamedRegister, Range.upto(C));
> +  return C;
> +}
> +
> +static MIToken::TokenKind symbolToken(char C) {
> +  switch (C) {
> +  case ',':
> +    return MIToken::comma;
> +  case '=':
> +    return MIToken::equal;
> +  default:
> +    return MIToken::Error;
> +  }
> +}
> +
> +static Cursor lexSymbol(Cursor C, MIToken::TokenKind Kind, MIToken &Token) {
> +  auto Range = C;
> +  C.advance();
> +  Token = MIToken(Kind, Range.upto(C));
> +  return C;
> +}
> +
>  MILexer::MILexer(StringRef Source) : CurrentSource(Source) {}
>  
>  StringRef MILexer::lexToken(MIToken &Token) {
> @@ -79,6 +106,11 @@
>    auto Char = C.peek();
>    if (isalpha(Char) || Char == '_')
>      return lexIdentifier(C, Token).remaining();
> +  if (Char == '%')
> +    return lexPercent(C, Token).remaining();
> +  MIToken::TokenKind Kind = symbolToken(Char);
> +  if (Kind != MIToken::Error)
> +    return lexSymbol(C, Kind, Token).remaining();
>    error(C.location(), Twine("unexpected character '") + Twine(Char) + "'");
>    return C.remaining();
>  }
> Index: lib/CodeGen/MIRParser/MILexer.h
> ===================================================================
> --- lib/CodeGen/MIRParser/MILexer.h
> +++ lib/CodeGen/MIRParser/MILexer.h
> @@ -26,8 +26,13 @@
>      Eof,
>      Error,
>  
> +    // Tokens with no info.
> +    comma,
> +    equal,
> +
>      // Identifier tokens
> -    Identifier
> +    Identifier,
> +    NamedRegister
>    };
>  
>  private:
> @@ -41,6 +46,8 @@
>  
>    bool isError() const { return Kind == Error; }
>  
> +  bool isRegister() const { return Kind == NamedRegister; }
> +
>    bool is(TokenKind K) const { return Kind == K; }
>  
>    bool isNot(TokenKind K) const { return Kind != K; }
> Index: lib/CodeGen/MIRParser/MIParser.cpp
> ===================================================================
> --- lib/CodeGen/MIRParser/MIParser.cpp
> +++ lib/CodeGen/MIRParser/MIParser.cpp
> @@ -34,6 +34,8 @@
>    MIToken Token;
>    /// Maps from instruction names to op codes.
>    StringMap<unsigned> Names2InstrOpCodes;
> +  /// Maps from register names to registers.
> +  StringMap<unsigned> Names2Regs;
>  
>  public:
>    MIParser(SourceMgr &SM, MachineFunction &MF, SMDiagnostic &Error,
> @@ -50,14 +52,26 @@
>  
>    MachineInstr *parse();
>  
> +  bool parseRegister(unsigned &Reg);
> +
> +  bool parseRegisterOperand(MachineOperand &Dest, bool IsDef = false);
> +
> +  bool parseMachineOperand(MachineOperand &Dest);

If you're not adding comments (I don't think you need them, since the
names and parameters are fairly clear), I think you should kill the
newlines between these functions.

> +
>  private:
>    void initNames2InstrOpCodes();
>  
>    /// Try to convert an instruction name to an opcode. Return true if the
>    /// instruction name is invalid.
>    bool parseInstrName(StringRef InstrName, unsigned &OpCode);
>  
>    bool parseInstruction(unsigned &OpCode);
> +
> +  void initNames2Regs();
> +
> +  /// Try to convert a register name to a register number. Return true if the
> +  /// register name is invalid.
> +  bool getRegisterByName(StringRef RegName, unsigned &Reg);
>  };
>  
>  } // end anonymous namespace
> @@ -87,13 +101,61 @@
>  MachineInstr *MIParser::parse() {
>    lex();
>  
> +  // Parse any register operands before '='
> +  // TODO: Allow parsing of multiple operands before '='
> +  MachineOperand MO = MachineOperand::CreateImm(0);
> +  SmallVector<MachineOperand, 8> Operands;
> +  if (Token.isRegister()) {
> +    if (parseRegisterOperand(MO, /*IsDef=*/true))
> +      return nullptr;
> +    Operands.push_back(MO);
> +    if (Token.isNot(MIToken::equal)) {
> +      error("expected '='");
> +      return nullptr;
> +    }
> +    lex();
> +  }
> +
>    unsigned OpCode;
>    if (Token.isError() || parseInstruction(OpCode))
>      return nullptr;
>  
> -  // TODO: Parse the rest of instruction - machine operands, etc.
> +  // TODO: Parse the instruction flags and memory operands.
> +
> +  // Parse the remaining machine operands.
> +  while (Token.isNot(MIToken::Eof)) {
> +    if (parseMachineOperand(MO))
> +      return nullptr;
> +    Operands.push_back(MO);
> +    if (Token.is(MIToken::Eof))
> +      break;
> +    if (Token.isNot(MIToken::comma)) {
> +      error("expected ',' before the next machine operand");
> +      return nullptr;
> +    }
> +    lex();
> +  }
> +
>    const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode);
> -  auto *MI = MF.CreateMachineInstr(MCID, DebugLoc());
> +
> +  // Verify machine operands

Add a `.` at the end of the sentence.

> +  if (!MCID.isVariadic()) {
> +    for (size_t I = 0, E = Operands.size(); I < E; ++I) {
> +      if (I < MCID.getNumOperands())
> +        continue;
> +      // Mark this register as implicit to prevent an assertion when it's added
> +      // to an instruction. This is a temporary workaround until the implicit
> +      // register flag can be parsed.
> +      Operands[I].setImplicit();
> +    }
> +  }
> +
> +  // TODO: Determine the implicit behaviour when implicit register flags are
> +  // parsed.
> +  auto *MI = MF.CreateMachineInstr(MCID, DebugLoc(), /*NoImplicit=*/true);
> +  for (const auto &Operand : Operands) {
> +    MI->addOperand(MF, Operand);
> +  }

Probably don't need the braces?

>    return MI;
>  }
>  
> @@ -103,6 +165,46 @@
>    StringRef InstrName = Token.stringValue();
>    if (parseInstrName(InstrName, OpCode))
>      return error(Twine("unknown machine instruction name '") + InstrName + "'");
> +  lex();
> +  return false;
> +}
> +
> +bool MIParser::parseRegister(unsigned &Reg) {
> +  switch (Token.kind()) {
> +  case MIToken::NamedRegister: {
> +    StringRef Name = Token.stringValue().drop_front(1); // Drop the '%'
> +    if (getRegisterByName(Name, Reg))
> +      return error(Twine("unknown register name '") + Name + "'");
> +    break;
> +  }
> +  // TODO: Parse other register kinds.
> +  default:
> +    llvm_unreachable("The current token should be a register");
> +  }
> +  return false;
> +}
> +
> +bool MIParser::parseRegisterOperand(MachineOperand &Dest, bool IsDef) {
> +  unsigned Reg;
> +  // TODO: Parse register flags

Add a `.`.

> +  if (parseRegister(Reg))
> +    return true;
> +  lex();
> +  // TODO: Parse subregister.
> +  Dest = MachineOperand::CreateReg(Reg, IsDef);
> +  return false;
> +}
> +
> +bool MIParser::parseMachineOperand(MachineOperand &Dest) {
> +  switch (Token.kind()) {
> +  case MIToken::NamedRegister:
> +    return parseRegisterOperand(Dest);
> +  case MIToken::Error:
> +    return true;
> +  default:
> +    // TODO: parse the other machine operands.
> +    return error("expected a machine operand");
> +  }
>    return false;
>  }
>  
> @@ -125,6 +227,24 @@
>    return false;
>  }
>  
> +void MIParser::initNames2Regs() {
> +  if (!Names2Regs.empty())
> +    return;
> +  const auto *TRI = MF.getSubtarget().getRegisterInfo();
> +  assert(TRI && "Expected target register info");
> +  for (unsigned I = 0, E = TRI->getNumRegs(); I < E; ++I)
> +    Names2Regs.insert(std::make_pair(StringRef(TRI->getName(I)).lower(), I));

Is this safe for registers?  (Ahmed?)

Anyway, we should at least check:

    for (unsigned I = 0, E = TRI->getNumRegs(); I < E; ++I) {
      bool WasInserted =
          Names2Regs.insert(std::make_pair(StringRef(TRI->getName(I)).lower(), I)).second;
      (void)WasInserted;
      assert(WasInserted && "Expected registers to be unique case-insensitively");
    }

Or maybe there's somewhere we can put this check to catch it at tablegen
time.

> +}
> +
> +bool MIParser::getRegisterByName(StringRef RegName, unsigned &Reg) {
> +  initNames2Regs();
> +  auto RegInfo = Names2Regs.find(RegName);
> +  if (RegInfo == Names2Regs.end())
> +    return true;
> +  Reg = RegInfo->getValue();
> +  return false;
> +}
> +
>  MachineInstr *llvm::parseMachineInstr(SourceMgr &SM, MachineFunction &MF,
>                                        StringRef Src, SMDiagnostic &Error) {
>    return MIParser(SM, MF, Error, Src).parse();
> Index: lib/CodeGen/MIRPrinter.cpp
> ===================================================================
> --- lib/CodeGen/MIRPrinter.cpp
> +++ lib/CodeGen/MIRPrinter.cpp
> @@ -50,6 +50,8 @@
>    MIPrinter(raw_ostream &OS) : OS(OS) {}
>  
>    void print(const MachineInstr &MI);
> +

I don't think this newline adds any value.

> +  void print(const MachineOperand &Op, const TargetRegisterInfo *TRI);
>  };
>  
>  } // end anonymous namespace
> @@ -110,10 +112,57 @@
>  
>  void MIPrinter::print(const MachineInstr &MI) {
>    const auto &SubTarget = MI.getParent()->getParent()->getSubtarget();
> +  const auto *TRI = SubTarget.getRegisterInfo();
> +  assert(TRI && "Expected target register info");
>    const auto *TII = SubTarget.getInstrInfo();
>  
> +  unsigned I = 0, E = MI.getNumOperands();
> +  for (; I < E && MI.getOperand(I).isReg() && MI.getOperand(I).isDef() &&
> +         !MI.getOperand(I).isImplicit();
> +       ++I) {
> +    if (I)
> +      OS << ", ";
> +    print(MI.getOperand(I), TRI);
> +  }
> +
> +  if (I)
> +    OS << " = ";
>    OS << StringRef(TII->getName(MI.getOpcode())).lower();
> -  // TODO: Print the instruction flags, machine operands, machine mem operands.
> +  // TODO: Print the instruction flags, machine mem operands.
> +  if (I < E)
> +    OS << ' ';
> +
> +  bool NeedComma = false;
> +  for (; I < E; ++I) {
> +    if (NeedComma)
> +      OS << ", ";
> +    print(MI.getOperand(I), TRI);
> +    NeedComma = true;
> +  }
> +}
> +
> +static void printReg(unsigned Reg, raw_ostream &OS,
> +                     const TargetRegisterInfo *TRI) {
> +  // TODO: Print Stack Slots
> +  // TODO: Print no register
> +  // TODO: Print virtual registers
> +  if (Reg < TRI->getNumRegs())
> +    OS << '%' << StringRef(TRI->getName(Reg)).lower();
> +  else
> +    llvm_unreachable("Can't print this kind of register yet");
> +}
> +
> +void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI) {
> +  switch (Op.getType()) {
> +  case MachineOperand::MO_Register:
> +    // TODO: Print register flags.
> +    printReg(Op.getReg(), OS, TRI);
> +    // TODO: Print sub register.
> +    break;
> +  default:
> +    // TODO: Print the other machine operands.
> +    llvm_unreachable("Can't print this machine operand at the moment");
> +  }
>  }
>  
>  void llvm::printMIR(raw_ostream &OS, const Module &M) {
> Index: test/CodeGen/MIR/X86/expected-machine-operand.mir
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/MIR/X86/expected-machine-operand.mir
> @@ -0,0 +1,20 @@
> +# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
> +
> +--- |
> +
> +  define i32 @foo() {
> +  entry:
> +    ret i32 0
> +  }
> +
> +...
> +---
> +name:            foo
> +body:
> + - name:         entry
> +   instructions:
> +     # CHECK: 1:16: expected a machine operand

Just noticed this error doesn't give the correct line number, just a
local line number for the instruction.  Why not?  I guess I missed this
in the earlier instruction patches?  (As long as it's fixable, I don't
think it's urgent to fix immediately, but it's kind of broken.)

> +     - '%eax = xor32rr ='
> +     - 'retq %eax'
> +...
> +
> Index: test/CodeGen/MIR/X86/missing-comma.mir
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/MIR/X86/missing-comma.mir
> @@ -0,0 +1,20 @@
> +# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
> +
> +--- |
> +
> +  define i32 @foo() {
> +  entry:
> +    ret i32 0
> +  }
> +
> +...
> +---
> +name:            foo
> +body:
> + - name:         entry
> +   instructions:
> +     # CHECK: 1:21: expected ',' before the next machine operand
> +     - '%eax = xor32rr %eax %eflags'
> +     - 'retq %eax'
> +...
> +
> Index: test/CodeGen/MIR/X86/named-registers.mir
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/MIR/X86/named-registers.mir
> @@ -0,0 +1,22 @@
> +# RUN: llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s
> +# This test ensures that the MIR parser parses X86 registers coorectly.
> +
> +--- |
> +
> +  define i32 @foo() {
> +  entry:
> +    ret i32 0
> +  }
> +
> +...
> +---
> +# CHECK: name: foo
> +name:            foo
> +body:
> + - name:         entry
> +   instructions:
> +     # CHECK:      - '%eax = mov32r0
> +     # CHECK-NEXT: - 'retq %eax
> +     - '%eax = mov32r0'
> +     - 'retq %eax'
> +...
> Index: test/CodeGen/MIR/X86/unknown-register.mir
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/MIR/X86/unknown-register.mir
> @@ -0,0 +1,21 @@
> +# RUN: not llc -march=x86-64 -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
> +# This test ensures that an error is reported when an unknown register is
> +# encountered.
> +
> +--- |
> +
> +  define i32 @foo() {
> +  entry:
> +    ret i32 0
> +  }
> +
> +...
> +---
> +name:            foo
> +body:
> + - name:         entry
> +   instructions:
> +     # CHECK: 1:1: unknown register name 'xax'
> +     - '%xax = mov32r0'
> +     - 'retq %xax'
> +...
>