Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

The complete tokenisation grammar

The machine-readable Pest grammar for tokenisation is presented here for convenience.

See Parsing Expression Grammars for an explanation of the notation.

This version of the grammar uses Pest's PUSH, PEEK, and POP to match the hash delimiters of raw double-quoted literals (see the RAW_DQ_REMAINDER and RAW_DQ_CONTENT definitions).

ANY, PATTERN_WHITE_SPACE, XID_START, and XID_CONTINUE are built in to Pest and so not defined below.

LF, DOUBLEQUOTE, and BACKSLASH are treated as special terminals in this writeup, but they are not built in to Pest so they have definitions below using character-sequence terminals which include escapes.

// Top-level rules: each matches a (possibly empty) sequence of tokens drawn
// from the correspondingly numbered TOKEN_* alternative list.
// NOTE(review): the year suffix presumably names the language edition — confirm.
TOKENS_2015 = { TOKEN_2015* }
TOKENS_2021 = { TOKEN_2021* }
TOKENS_2024 = { TOKEN_2024* }

// One token, 2015 flavour.
// This is a PEG ordered choice: alternatives are tried top to bottom and the
// first one that matches wins, so the order below is significant (for example,
// Raw_ident must be tried before Ident, which would otherwise match just "r").
TOKEN_2015 = {
    // Whitespace and comments.
      Whitespace
    | Line_comment
    | Block_comment
    | Unterminated_block_comment
    // Quoted literals.
    | Character_literal
    | Byte_literal
    | String_literal
    | Byte_string_literal
    | Raw_string_literal
    | Raw_byte_string_literal
    // Quoted forms that did not match as a complete literal above.
    | Unterminated_literal_2015
    | Reserved_single_quoted_literal_2015
    // Numeric literals; the float forms are tried before Integer_literal.
    | Float_literal
    | Reserved_float
    | Integer_literal
    // Lifetimes, identifiers and reserved prefixes.
    | Lifetime_or_label
    | Raw_ident
    | Reserved_prefix_2015
    | Ident
    // Single-character punctuation is the last resort.
    | Punctuation
}

// One token, 2021 flavour.
// This is a PEG ordered choice: alternatives are tried top to bottom and the
// first one that matches wins, so the order below is significant.
// Compared with TOKEN_2015 this list adds the C-string and raw-lifetime forms
// and uses the *_2021 reserved rules in place of the *_2015 ones.
TOKEN_2021 = {
    // Whitespace and comments.
      Whitespace
    | Line_comment
    | Block_comment
    | Unterminated_block_comment
    // Quoted literals.
    | Character_literal
    | Byte_literal
    | String_literal
    | Byte_string_literal
    | C_string_literal
    | Raw_string_literal
    | Raw_byte_string_literal
    | Raw_c_string_literal
    // Quoted forms that did not match as a complete literal above.
    | Reserved_literal_2021
    | Reserved_single_quoted_literal_2021
    // Numeric literals; the float forms are tried before Integer_literal.
    | Float_literal
    | Reserved_float
    | Integer_literal
    // Lifetimes and labels: raw and reserved-prefix forms before the plain one.
    | Raw_lifetime_or_label
    | Reserved_lifetime_or_label_prefix
    | Lifetime_or_label
    // Identifiers and reserved prefixes.
    | Raw_ident
    | Reserved_prefix_2021
    | Ident
    // Single-character punctuation is the last resort.
    | Punctuation
}

// One token, 2024 flavour.
// This is a PEG ordered choice: alternatives are tried top to bottom and the
// first one that matches wins, so the order below is significant.
// The list is the same as TOKEN_2021 apart from the extra Reserved_guard
// alternative.
TOKEN_2024 = {
    // Whitespace and comments.
      Whitespace
    | Line_comment
    | Block_comment
    | Unterminated_block_comment
    // Quoted literals.
    | Character_literal
    | Byte_literal
    | String_literal
    | Byte_string_literal
    | C_string_literal
    | Raw_string_literal
    | Raw_byte_string_literal
    | Raw_c_string_literal
    // Quoted forms that did not match as a complete literal above.
    | Reserved_literal_2021
    | Reserved_single_quoted_literal_2021
    // Tried well before Punctuation, which would otherwise take a lone "#".
    | Reserved_guard
    // Numeric literals; the float forms are tried before Integer_literal.
    | Float_literal
    | Reserved_float
    | Integer_literal
    // Lifetimes and labels: raw and reserved-prefix forms before the plain one.
    | Raw_lifetime_or_label
    | Reserved_lifetime_or_label_prefix
    | Lifetime_or_label
    // Identifiers and reserved prefixes.
    | Raw_ident
    | Reserved_prefix_2021
    | Ident
    // Single-character punctuation is the last resort.
    | Punctuation
}


// A maximal run of one or more PATTERN_WHITE_SPACE characters forms one token.
Whitespace = { PATTERN_WHITE_SPACE+ }

// A line comment: two slashes followed by its content.
Line_comment = { "//" ~ LINE_COMMENT_CONTENT }
// The content is every character up to, but not including, the next LF
// (or up to the end of input when there is no further LF). It may be empty.
LINE_COMMENT_CONTENT = { (!LF ~ ANY)* }

// Token-level wrapper around the recursive BLOCK_COMMENT rule.
Block_comment = { BLOCK_COMMENT }
// A complete block comment: opener, content, closer.
BLOCK_COMMENT = { "/*" ~ BLOCK_COMMENT_CONTENT ~ "*/" }
// The content is any mix of nested block comments and single characters.
// A single character is only consumed when neither a closer nor an opener
// starts at that position, so an opener inside the content must begin a
// complete nested comment.
BLOCK_COMMENT_CONTENT = {
    (
        BLOCK_COMMENT
      | (!"*/" ~ !"/*" ~ ANY)
    )*
}

// Matches just the opener. In the TOKEN_* lists this comes after Block_comment,
// so it is only reached when no complete block comment matches here.
Unterminated_block_comment = { "/*" }


// Shared tail of the single-quoted literals: opening quote, content, closing
// quote, then an optional suffix.
SQ_REMAINDER = { "'" ~ SQ_CONTENT ~ "'" ~ SUFFIX? }
// The content is one of (tried in this order):
//   1. a backslash, the character after it (whatever it is, including a
//      single quote), and then everything up to the next single quote;
//   2. exactly one character that is not a single quote.
SQ_CONTENT = {
      (BACKSLASH ~ ANY ~ (!"'" ~ ANY)*)
    | (!"'" ~ ANY)
}

// A character literal has no prefix.
Character_literal = { SQ_REMAINDER }

// A byte literal is the same form prefixed with "b".
Byte_literal = { "b" ~ SQ_REMAINDER }


// Shared tail of the (non-raw) double-quoted literals: opening quote, content,
// closing quote, then an optional suffix.
DQ_REMAINDER = { DOUBLEQUOTE ~ DQ_CONTENT ~ DOUBLEQUOTE ~ SUFFIX? }
// The content is a possibly empty sequence of items, each being either
//   - a backslash together with the character after it (so a backslash-escaped
//     double quote does not end the literal), or
//   - any single character other than a double quote.
DQ_CONTENT = {
    (
        (BACKSLASH ~ ANY)
      | (!DOUBLEQUOTE ~ ANY)
    )*
}

// A string literal has no prefix.
String_literal = { DQ_REMAINDER }

// A byte-string literal is the same form prefixed with "b".
Byte_string_literal = { "b" ~ DQ_REMAINDER }

// A C-string literal is the same form prefixed with "c".
C_string_literal = { "c" ~ DQ_REMAINDER }



// Shared tail of the raw double-quoted literals.
// PUSH(HASHES) matches the opening run of "#" characters and saves it on
// Pest's stack. After the closing double quote, POP must match that same saved
// text again (and removes it from the stack), so the closing hashes have to
// equal the opening ones. An optional suffix may follow.
RAW_DQ_REMAINDER = {
    PUSH(HASHES) ~ DOUBLEQUOTE ~ RAW_DQ_CONTENT ~ DOUBLEQUOTE ~ POP ~ SUFFIX?
}
// The content is any characters up to the first position where a double quote
// followed by the saved hashes (PEEK matches them without popping) begins.
RAW_DQ_CONTENT = { (!(DOUBLEQUOTE ~ PEEK) ~ ANY)* }
// Zero or more "#" characters.
HASHES = { "#"* }

// The three raw literal kinds differ only in their prefix.
Raw_string_literal = { "r" ~ RAW_DQ_REMAINDER }

Raw_byte_string_literal = { "br" ~ RAW_DQ_REMAINDER }

Raw_c_string_literal = { "cr" ~ RAW_DQ_REMAINDER }


// Matches only the opening of a raw string, raw byte string or byte literal.
// In TOKEN_2015 it is listed after the complete literal rules, so it is reached
// only when none of those matched at this position.
Unterminated_literal_2015 = {
      ("r" ~ DOUBLEQUOTE)
    | ("br" ~ DOUBLEQUOTE)
    | "b'"
}
// An identifier immediately followed by a double quote or a single quote.
Reserved_literal_2021 = { IDENT ~ (DOUBLEQUOTE | "'") }

// A single-quoted identifier; the 2021 form also admits a leading "r#".
Reserved_single_quoted_literal_2015 = { "'" ~ IDENT ~ "'" }
Reserved_single_quoted_literal_2021 = { "'" ~ "r#"? ~ IDENT ~ "'" }


// Two hashes, or a hash immediately followed by a double quote.
Reserved_guard = { "##" | ("#" ~ DOUBLEQUOTE) }


// Zero or more decimal digits or underscores.
DECIMAL_DIGITS = { ('0'..'9' | "_")* }
// Zero or more hexadecimal digits (either case) or underscores.
HEXADECIMAL_DIGITS = { ('0'..'9' | 'a'..'f' | 'A'..'F' | "_")* }
// Digits after a "0b" or "0o" prefix: at this level any decimal digit is
// accepted, not only the digits valid in that base.
LOW_BASE_TOKEN_DIGITS = { DECIMAL_DIGITS }
// One decimal digit, then any further digits or underscores; it therefore
// cannot be empty and cannot start with an underscore.
DECIMAL_PART = { '0'..'9' ~ DECIMAL_DIGITS }


// A float literal takes one of three shapes, tried in this order:
//   1. a body with an exponent, optionally suffixed (e.g. 1e5, 1.5E-3f32);
//   2. a body with a fractional part and no exponent, optionally suffixed
//      (e.g. 1.5, 1.5f32); rejected when an "e"/"E" follows, so an exponent
//      marker that failed shape 1 is not taken as the start of a suffix;
//   3. a body ending in a bare dot (e.g. "1."), accepted only when the dot is
//      not followed by another dot or by the start of an identifier; no suffix.
Float_literal = {
      (FLOAT_BODY_WITH_EXPONENT ~ SUFFIX?)
    | (FLOAT_BODY_WITHOUT_EXPONENT ~ !("e" | "E") ~ SUFFIX?)
    | (FLOAT_BODY_WITH_FINAL_DOT ~ !"." ~ !IDENT_START)
}

// Integer part, optional fractional part, exponent marker, optional sign,
// exponent digits.
FLOAT_BODY_WITH_EXPONENT = {
    DECIMAL_PART ~ ("." ~ DECIMAL_PART)? ~ ("e" | "E") ~ ("+" | "-")? ~ EXPONENT_DIGITS
}
// Any number of leading underscores, then at least one decimal digit, then
// further digits or underscores.
EXPONENT_DIGITS = { "_"* ~ '0'..'9' ~ DECIMAL_DIGITS }

// Integer part, dot, fractional part.
FLOAT_BODY_WITHOUT_EXPONENT = { DECIMAL_PART ~ "." ~ DECIMAL_PART }

// Integer part followed by a dot and nothing else.
FLOAT_BODY_WITH_FINAL_DOT = { DECIMAL_PART ~ "." }

// Float-like forms that are matched here rather than by Float_literal, which
// precedes this rule in the TOKEN_* lists.
Reserved_float = { RESERVED_FLOAT_EMPTY_EXPONENT | RESERVED_FLOAT_BASED }
// A decimal body with an exponent marker and optional sign; no exponent
// digits are consumed by this rule.
RESERVED_FLOAT_EMPTY_EXPONENT = {
    DECIMAL_PART ~ ("." ~ DECIMAL_PART)? ~ ("e" | "E") ~ ("+" | "-")?
}
// A "0b", "0o" or "0x" prefixed number followed by either
//   - an "e"/"E", or
//   - a dot that is not followed by another dot or the start of an identifier.
RESERVED_FLOAT_BASED = {
    (
        (("0b" | "0o") ~ LOW_BASE_TOKEN_DIGITS)
      | ("0x" ~ HEXADECIMAL_DIGITS)
    ) ~ (
        ("e" | "E")
      | ("." ~ !"." ~ !IDENT_START)
    )
}


// An integer literal: a body in one of four bases plus an optional suffix.
// The prefixed bases are tried first; INTEGER_DECIMAL_LITERAL placed earlier
// would match just the leading "0" of "0b", "0o" or "0x".
Integer_literal = {
    (
        INTEGER_BINARY_LITERAL
      | INTEGER_OCTAL_LITERAL
      | INTEGER_HEXADECIMAL_LITERAL
      | INTEGER_DECIMAL_LITERAL
    ) ~ SUFFIX_NO_E?
}

// Bodies. The digits after a prefix may be empty, because the *_DIGITS rules
// match zero or more characters.
INTEGER_BINARY_LITERAL = { "0b" ~ LOW_BASE_TOKEN_DIGITS }
INTEGER_OCTAL_LITERAL = { "0o" ~ LOW_BASE_TOKEN_DIGITS }
INTEGER_HEXADECIMAL_LITERAL = { "0x" ~ HEXADECIMAL_DIGITS }
INTEGER_DECIMAL_LITERAL = { DECIMAL_PART }

// A suffix whose first character is not "e" or "E".
SUFFIX_NO_E = { !("e" | "E") ~ SUFFIX }


// A single quote, "r#", then an identifier.
Raw_lifetime_or_label = { "'r#" ~ IDENT }

// A single quote, an identifier, then a hash.
Reserved_lifetime_or_label_prefix = { "'" ~ IDENT ~ "#" }

// A single quote then an identifier. In the TOKEN_2021 and TOKEN_2024 lists
// the two more specific forms above are tried before this one.
Lifetime_or_label = { "'" ~ IDENT }

// "r#" then an identifier.
Raw_ident = { "r#" ~ IDENT }

// Reserved prefixes: two fixed spellings in the 2015 form, any identifier
// followed by a hash in the 2021 form.
Reserved_prefix_2015 = { "r#" | "br#" }
Reserved_prefix_2021 = { IDENT ~ "#" }

// A plain identifier token.
Ident = { IDENT }


// Exactly one punctuation character. Every alternative is a single character,
// so multi-character operators are not formed by this rule. The alternatives
// are in the same order as before, only laid out several to a line.
Punctuation = {
      ";" | "," | "."
    | "(" | ")" | "{" | "}" | "[" | "]"
    | "@" | "#" | "~" | "?" | ":" | "$"
    | "=" | "!" | "<" | ">"
    | "-" | "&" | "|" | "+" | "*" | "/" | "^" | "%"
}


// A literal suffix has the same form as an identifier.
SUFFIX = { IDENT }
// An identifier: one start character, then any number of XID_CONTINUE characters.
IDENT = { IDENT_START ~ XID_CONTINUE* }
// A start character is an XID_START character or an underscore.
IDENT_START = { XID_START | "_" }


// Named single-character terminals, written with Pest string escapes.
// Line feed.
LF = { "\n" }
// Double quote.
DOUBLEQUOTE = { "\"" }
// Backslash.
BACKSLASH = { "\\" }