The complete pretokenisation grammar

The machine-readable Pest grammar for pretokenisation is presented here for convenience.

See Parsing Expression Grammar notation for an explanation of the notation.

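This version of the grammar uses Pest's PUSH, PEEK, and POP stack operations in the Raw_double_quoted_literal definitions, so that the run of # characters closing a raw literal must match the run that opened it.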

ANY, PATTERN_WHITE_SPACE, XID_START, and XID_CONTINUE are built in to Pest, so they are not defined below.

// PRETOKEN_2015: top-level rule for the `_2015` variant of the grammar.
// Matches exactly one pretoken. PEG choice is ordered: alternatives are tried
// from top to bottom and the first one that matches wins, so the sequence
// below is significant and must not be rearranged casually.
PRETOKEN_2015 = {
    Whitespace |
    Line_comment |
    // Block_comment has to be tried before Unterminated_block_comment, which
    // matches only the bare "/*" opener.
    Block_comment |
    Unterminated_block_comment |
    // Complete quoted literals come before Unterminated_literal_2015, which
    // matches only a literal's opening characters, and before Identifier,
    // which would otherwise match a "b" / "r" / "br" prefix on its own.
    Single_quoted_literal |
    Double_quoted_literal_2015 |
    Raw_double_quoted_literal_2015 |
    Unterminated_literal_2015 |
    // Numeric rules: each Reserved_* rule sits ahead of the more general rule
    // that would otherwise accept the same input (for example
    // Reserved_integer_e_suffix_restriction before Integer_literal), and all
    // float forms precede Integer_literal, which would match their leading
    // digits.
    Float_literal_1 |
    Reserved_float_empty_exponent |
    Reserved_float_e_suffix_restriction |
    Float_literal_2 |
    Reserved_float_based |
    Reserved_integer_e_suffix_restriction |
    Integer_literal |
    Lifetime_or_label |
    // Raw_identifier ("r#" + IDENT) before Reserved_prefix_2015 (bare "r#" or
    // "br#"), and both before Identifier, which would match just the letters.
    Raw_identifier |
    Reserved_prefix_2015 |
    Identifier |
    Punctuation
}

// PRETOKEN_2021: top-level rule for the `_2021` variant of the grammar.
// Matches exactly one pretoken. PEG choice is ordered (first matching
// alternative wins), so the sequence below is significant.
// Compared with PRETOKEN_2015 it uses the `_2021` quoted-literal rules, has
// Reserved_literal_2021 where that rule has Unterminated_literal_2015, adds
// two lifetime-related rules ahead of Lifetime_or_label, and uses
// Reserved_prefix_2021 in place of Reserved_prefix_2015.
PRETOKEN_2021 = {
    Whitespace |
    Line_comment |
    // Block_comment has to be tried before Unterminated_block_comment, which
    // matches only the bare "/*" opener.
    Block_comment |
    Unterminated_block_comment |
    // Complete quoted literals are tried before Reserved_literal_2021, which
    // matches any IDENT immediately followed by a quote character.
    Single_quoted_literal |
    Double_quoted_literal_2021 |
    Raw_double_quoted_literal_2021 |
    Reserved_literal_2021 |
    // Numeric rules: each Reserved_* rule sits ahead of the more general rule
    // that would otherwise accept the same input, and all float forms precede
    // Integer_literal, which would match their leading digits.
    Float_literal_1 |
    Reserved_float_empty_exponent |
    Reserved_float_e_suffix_restriction |
    Float_literal_2 |
    Reserved_float_based |
    Reserved_integer_e_suffix_restriction |
    Integer_literal |
    // Raw_lifetime_or_label_2021 ("'r#" + IDENT) comes first because
    // Reserved_lifetime_or_label_prefix_2021 ("'" + IDENT + "#") would
    // otherwise match its "'r#" start.
    Raw_lifetime_or_label_2021 |
    Reserved_lifetime_or_label_prefix_2021 |
    Lifetime_or_label |
    // Raw_identifier ("r#" + IDENT) before Reserved_prefix_2021 (IDENT + "#"),
    // and both before Identifier.
    Raw_identifier |
    Reserved_prefix_2021 |
    Identifier |
    Punctuation
}

// PRETOKEN_2024: top-level rule for the `_2024` variant of the grammar.
// Matches exactly one pretoken. PEG choice is ordered (first matching
// alternative wins), so the sequence below is significant.
// The alternatives are those of PRETOKEN_2021 with one addition:
// Reserved_guard_2024, placed directly after Reserved_literal_2021.
PRETOKEN_2024 = {
    Whitespace |
    Line_comment |
    // Block_comment has to be tried before Unterminated_block_comment, which
    // matches only the bare "/*" opener.
    Block_comment |
    Unterminated_block_comment |
    // Complete quoted literals are tried before Reserved_literal_2021, which
    // matches any IDENT immediately followed by a quote character.
    Single_quoted_literal |
    Double_quoted_literal_2021 |
    Raw_double_quoted_literal_2021 |
    Reserved_literal_2021 |
    // Reserved_guard_2024 ("##" or "#" + double quote) has to precede
    // Punctuation, which would otherwise match the single "#".
    Reserved_guard_2024 |
    // Numeric rules: each Reserved_* rule sits ahead of the more general rule
    // that would otherwise accept the same input, and all float forms precede
    // Integer_literal, which would match their leading digits.
    Float_literal_1 |
    Reserved_float_empty_exponent |
    Reserved_float_e_suffix_restriction |
    Float_literal_2 |
    Reserved_float_based |
    Reserved_integer_e_suffix_restriction |
    Integer_literal |
    // Raw_lifetime_or_label_2021 ("'r#" + IDENT) comes first because
    // Reserved_lifetime_or_label_prefix_2021 ("'" + IDENT + "#") would
    // otherwise match its "'r#" start.
    Raw_lifetime_or_label_2021 |
    Reserved_lifetime_or_label_prefix_2021 |
    Lifetime_or_label |
    // Raw_identifier ("r#" + IDENT) before Reserved_prefix_2021 (IDENT + "#"),
    // and both before Identifier.
    Raw_identifier |
    Reserved_prefix_2021 |
    Identifier |
    Punctuation
}


// IDENT: one IDENT_START character followed by zero or more XID_CONTINUE
// characters. A lone "_" therefore matches.
IDENT = { IDENT_START ~ XID_CONTINUE * }
// IDENT_START: a character with the XID_START property, or an underscore.
IDENT_START = { XID_START | "_" }
// SUFFIX: the trailing name allowed after literal rules; it has exactly the
// shape of IDENT.
SUFFIX = { IDENT }


// Whitespace: a maximal run of one or more characters matched by Pest's
// built-in PATTERN_WHITE_SPACE.
Whitespace = { PATTERN_WHITE_SPACE + }

// Line_comment: "//" followed by the rest of the line.
Line_comment = { "//" ~ LINE_COMMENT_CONTENT }
// LINE_COMMENT_CONTENT: every character up to, but not including, the next
// "\n" (or up to end of input). The newline itself is not consumed.
LINE_COMMENT_CONTENT = { ( !"\n" ~ ANY )* }

// Block_comment: a terminated "/* ... */" comment. Nesting is supported
// through the recursive reference in BLOCK_COMMENT_CONTENT.
Block_comment = { "/*" ~ BLOCK_COMMENT_CONTENT ~ "*/" }
// BLOCK_COMMENT_CONTENT: zero or more items, each either a complete nested
// Block_comment or a single character that does not begin "*/" or "/*".
// If a nested "/*" has no matching "*/", the repetition stops at that "/*",
// so the enclosing Block_comment fails to match as well.
BLOCK_COMMENT_CONTENT = { ( Block_comment | !"*/" ~ !"/*" ~ ANY ) * }

// Unterminated_block_comment: just the two-character "/*" opener. In the
// PRETOKEN_* rules it is listed directly after Block_comment, so it is only
// reached when no terminated comment could be matched.
Unterminated_block_comment = { "/*" }


// Single_quoted_literal: optional prefix, opening "'", content, closing "'",
// then an optional SUFFIX.
Single_quoted_literal = {
    SQ_PREFIX ~
    "'" ~ SQ_CONTENT ~ "'" ~
    SUFFIX ?
}

// SQ_PREFIX: an optional "b"; matches the empty string when absent.
SQ_PREFIX = { "b" ? }

// SQ_CONTENT: what may appear between the quotes. Two ordered alternatives:
//   1. a backslash, then any one character (which may itself be a "'"),
//      then any run of characters other than "'";
//   2. exactly one character that is not "'".
SQ_CONTENT = {
    "\\" ~ ANY ~ ( !"'" ~ ANY ) * |
    !"'" ~ ANY
}


// Double_quoted_literal_2015 / _2021: a prefix followed by the shared
// DQ_REMAINDER. The two rules differ only in which prefixes they accept.
Double_quoted_literal_2015 = { DQ_PREFIX_2015 ~ DQ_REMAINDER }
Double_quoted_literal_2021 = { DQ_PREFIX_2021 ~ DQ_REMAINDER }

// DQ_PREFIX_2015: an optional "b".
DQ_PREFIX_2015 = { "b" ? }
// DQ_PREFIX_2021: an optional "b" or "c".
DQ_PREFIX_2021 = { ( "b" | "c" ) ? }

// DQ_REMAINDER: opening double quote, content, closing double quote, then an
// optional SUFFIX.
DQ_REMAINDER = {
    "\"" ~ DQ_CONTENT ~ "\"" ~
    SUFFIX ?
}
// DQ_CONTENT: zero or more items, each either a backslash followed by any one
// character, or a single character that is not a double quote. The backslash
// alternative is tried first, so a backslash-escaped double quote does not
// end the content.
DQ_CONTENT = {
    (
        "\\" ~ ANY |
        !"\"" ~ ANY
    ) *
}


// Raw_double_quoted_literal_2015 / _2021: a prefix followed by the shared
// RAW_DQ_REMAINDER. The two rules differ only in which prefixes they accept.
Raw_double_quoted_literal_2015 = { RAW_DQ_PREFIX_2015 ~ RAW_DQ_REMAINDER }
Raw_double_quoted_literal_2021 = { RAW_DQ_PREFIX_2021 ~ RAW_DQ_REMAINDER }

// RAW_DQ_PREFIX_2015: "r" or "br". Unlike the non-raw prefixes, this is
// mandatory.
RAW_DQ_PREFIX_2015 = { "r" | "br" }
// RAW_DQ_PREFIX_2021: "r", "br" or "cr".
RAW_DQ_PREFIX_2021 = { "r" | "br" | "cr" }

// RAW_DQ_REMAINDER: the part after the prefix.
//   PUSH(HASHES) matches the opening run of "#" and pushes the matched text
//   onto Pest's stack; POP then requires that same text again after the
//   closing double quote (and removes it from the stack). The closing run of
//   "#" must therefore equal the opening run.
RAW_DQ_REMAINDER = {
    PUSH(HASHES) ~
    "\"" ~ RAW_DQ_CONTENT ~ "\"" ~
    POP ~
    SUFFIX ?
}
// RAW_DQ_CONTENT: any characters up to the first double quote that is
// immediately followed by the pushed "#" run (PEEK matches the top of the
// stack without popping it). No backslash handling is done here.
RAW_DQ_CONTENT = {
    ( !("\"" ~ PEEK) ~ ANY ) *
}
// HASHES: between 0 and 255 "#" characters; may match the empty string.
HASHES = { "#" {0, 255} }

// Unterminated_literal_2015: only the opening characters of a prefixed
// literal: "r" + double quote, "br" + double quote, or "b" + single quote.
// In PRETOKEN_2015 it is listed after the complete quoted-literal rules.
Unterminated_literal_2015 = { "r\"" | "br\"" | "b'" }
// Reserved_literal_2021: any IDENT immediately followed by a double or single
// quote character. In PRETOKEN_2021 and PRETOKEN_2024 it is listed after the
// complete quoted-literal rules.
Reserved_literal_2021 = { IDENT ~ ( "\"" | "'" ) }

// Reserved_guard_2024: "##", or "#" followed by a double quote. Referenced
// only from PRETOKEN_2024.
Reserved_guard_2024 = { "##" | "#\"" }

// DECIMAL_DIGITS: zero or more characters, each a decimal digit or "_".
// May match the empty string.
DECIMAL_DIGITS = { ('0'..'9' | "_") * }
// HEXADECIMAL_DIGITS: zero or more characters, each a hexadecimal digit
// (either letter case) or "_". May match the empty string.
HEXADECIMAL_DIGITS = { ('0'..'9' | 'a' .. 'f' | 'A' .. 'F' | "_") * }
// LOW_BASE_PRETOKEN_DIGITS: the digit run used after "0b" and "0o". It is
// defined as DECIMAL_DIGITS, so any decimal digit is accepted by these rules.
LOW_BASE_PRETOKEN_DIGITS = { DECIMAL_DIGITS }
// DECIMAL_PART: one decimal digit followed by DECIMAL_DIGITS, so it cannot be
// empty and cannot begin with "_".
DECIMAL_PART = { '0'..'9' ~ DECIMAL_DIGITS }

// RESTRICTED_E_SUFFIX: "e" or "E", then one or more "_", then one character
// that is XID_CONTINUE but not XID_START (the negative lookahead rejects
// XID_START before XID_CONTINUE consumes the character).
RESTRICTED_E_SUFFIX = { ("e"|"E") ~ "_"+ ~ !XID_START ~ XID_CONTINUE }


// Float_literal_1: a float body that has an exponent, then an optional SUFFIX.
Float_literal_1 = {
    FLOAT_BODY_WITH_EXPONENT ~ SUFFIX ?
}
// Float_literal_2: a float without an exponent. Two ordered alternatives:
//   1. digits "." digits, then an optional SUFFIX;
//   2. digits "." where the next character is neither another "." nor an
//      IDENT_START character; this form takes no SUFFIX.
Float_literal_2 = {
    FLOAT_BODY_WITHOUT_EXPONENT ~ SUFFIX ? |
    FLOAT_BODY_WITH_FINAL_DOT ~ !"." ~ !IDENT_START
}

// FLOAT_BODY_WITH_EXPONENT: DECIMAL_PART, an optional "." + DECIMAL_PART,
// then "e" or "E", an optional sign, and EXPONENT_DIGITS.
FLOAT_BODY_WITH_EXPONENT = {
    DECIMAL_PART ~ ("." ~ DECIMAL_PART ) ? ~
    ("e"|"E") ~ ("+"|"-") ? ~ EXPONENT_DIGITS
}
// EXPONENT_DIGITS: any number of leading "_", then at least one decimal
// digit, then DECIMAL_DIGITS.
EXPONENT_DIGITS = { "_" * ~ '0'..'9' ~ DECIMAL_DIGITS }

// FLOAT_BODY_WITHOUT_EXPONENT: DECIMAL_PART "." DECIMAL_PART.
FLOAT_BODY_WITHOUT_EXPONENT = {
    DECIMAL_PART ~ "." ~ DECIMAL_PART
}

// FLOAT_BODY_WITH_FINAL_DOT: DECIMAL_PART followed by a "." with no digits
// after it.
FLOAT_BODY_WITH_FINAL_DOT = {
    DECIMAL_PART ~ "."
}

// Reserved_float_empty_exponent: a decimal number (with optional fractional
// part) followed by "e" or "E" and a mandatory sign. No exponent digits are
// required here; in the PRETOKEN_* rules this comes after Float_literal_1,
// which handles the case where exponent digits are present.
Reserved_float_empty_exponent = {
    DECIMAL_PART ~ ("." ~ DECIMAL_PART ) ? ~
    ("e"|"E") ~ ("+"|"-")
}
// Reserved_float_e_suffix_restriction: digits "." digits followed by
// RESTRICTED_E_SUFFIX. In the PRETOKEN_* rules this comes before
// Float_literal_2, whose optional SUFFIX would otherwise accept the same
// characters.
Reserved_float_e_suffix_restriction = {
    DECIMAL_PART ~ "." ~ DECIMAL_PART ~
    RESTRICTED_E_SUFFIX
}
// Reserved_float_based: a "0b", "0o" or "0x" number followed by something
// float-like: either "e"/"E" and then a sign or EXPONENT_DIGITS, or a "."
// that is not followed by another "." or an IDENT_START character.
// Note: HEXADECIMAL_DIGITS greedily consumes "e" and "E" as hex digits, so
// after the "0x" form only the "." alternative can actually match.
Reserved_float_based = {
    (
        ("0b" | "0o") ~ LOW_BASE_PRETOKEN_DIGITS |
        "0x" ~ HEXADECIMAL_DIGITS
    )  ~  (
        ("e"|"E") ~ ("+"|"-" | EXPONENT_DIGITS) |
        "." ~ !"." ~ !IDENT_START
    )
}

// Reserved_integer_e_suffix_restriction: a binary, octal or decimal integer
// body followed by RESTRICTED_E_SUFFIX. The hexadecimal form is not included.
// In the PRETOKEN_* rules this comes before Integer_literal, whose optional
// SUFFIX would otherwise accept the same characters.
Reserved_integer_e_suffix_restriction = {
    ( INTEGER_BINARY_LITERAL |
      INTEGER_OCTAL_LITERAL |
      INTEGER_DECIMAL_LITERAL ) ~
    RESTRICTED_E_SUFFIX
}

// Integer_literal: an integer body in one of four forms, then an optional
// SUFFIX. The based forms are tried before the decimal form, which would
// otherwise match the leading "0" of "0b" / "0o" / "0x".
Integer_literal = {
    ( INTEGER_BINARY_LITERAL |
      INTEGER_OCTAL_LITERAL |
      INTEGER_HEXADECIMAL_LITERAL |
      INTEGER_DECIMAL_LITERAL ) ~
    SUFFIX ?
}

// The digit runs after "0b", "0o" and "0x" may be empty, because
// LOW_BASE_PRETOKEN_DIGITS and HEXADECIMAL_DIGITS both allow zero characters.
INTEGER_BINARY_LITERAL = { "0b" ~ LOW_BASE_PRETOKEN_DIGITS }
INTEGER_OCTAL_LITERAL = { "0o" ~ LOW_BASE_PRETOKEN_DIGITS }
INTEGER_HEXADECIMAL_LITERAL = { "0x" ~ HEXADECIMAL_DIGITS }
// INTEGER_DECIMAL_LITERAL: at least one decimal digit (see DECIMAL_PART).
INTEGER_DECIMAL_LITERAL = { DECIMAL_PART }


// Raw_lifetime_or_label_2021: "'r#" followed by an IDENT, provided the next
// character is not "'" (negative lookahead; the character is not consumed).
Raw_lifetime_or_label_2021 = { "'r#" ~ IDENT ~ !"'" }

// Reserved_lifetime_or_label_prefix_2021: "'" followed by an IDENT and then
// a "#".
Reserved_lifetime_or_label_prefix_2021 = { "'" ~ IDENT ~ "#" }

// Lifetime_or_label: "'" followed by an IDENT, provided the next character is
// not "'" (negative lookahead; the character is not consumed).
Lifetime_or_label = { "'" ~ IDENT ~ !"'" }

// Raw_identifier: "r#" followed by an IDENT.
Raw_identifier = { "r#" ~ IDENT }

// Reserved_prefix_2015: exactly "r#" or "br#".
Reserved_prefix_2015 = { "r#" | "br#" }
// Reserved_prefix_2021: any IDENT immediately followed by "#".
Reserved_prefix_2021 = { IDENT ~ "#" }

// Identifier: a plain IDENT. In the PRETOKEN_* rules it is tried after every
// rule that begins with an identifier-like prefix.
Identifier = { IDENT }


// Punctuation: one punctuation character. Every alternative below is a
// single-character literal, so this rule never consumes more than one
// character per match. It is the final alternative in each PRETOKEN_* rule.
Punctuation = {
    ";" |
    "," |
    "." |
    "(" |
    ")" |
    "{" |
    "}" |
    "[" |
    "]" |
    "@" |
    "#" |
    "~" |
    "?" |
    ":" |
    "$" |
    "=" |
    "!" |
    "<" |
    ">" |
    "-" |
    "&" |
    "|" |
    "+" |
    "*" |
    "/" |
    "^" |
    "%"
}