// This grammar only concerns the inline structure according to the CommonMark Spec // (https://spec.commonmark.org/0.30/#inlines) // For more information see README.md /// const common = require('../common/common'); // Levels used for dynamic precedence. Ideally // n * PRECEDENCE_LEVEL_EMPHASIS > PRECEDENCE_LEVEL_LINK for any n, so maybe the // magnitude of these values should be increased in the future const PRECEDENCE_LEVEL_EMPHASIS = 1; const PRECEDENCE_LEVEL_LINK = 10; const PRECEDENCE_LEVEL_HTML = 100; // Punctuation characters as specified in // https://github.github.com/gfm/#ascii-punctuation-character const PUNCTUATION_CHARACTERS_REGEX = '!-/:-@\\[-`\\{-~'; // !!! // Notice the call to `add_inline_rules` which generates some additional rules related to parsing // inline contents in different contexts. // !!! module.exports = grammar(add_inline_rules({ name: 'markdown_inline', externals: $ => [ // An `$._error` token is never valid and gets emitted to kill invalid parse branches. Concretely // this is used to decide whether a newline closes a paragraph, and it also gets emitted // when trying to parse the `$._trigger_error` token in `$.link_title`. $._error, $._trigger_error, // Opening and closing delimiters for code spans. These are sequences of one or more backticks. // An opening token does not mean the text after has to be a code span if there is no closing token $._code_span_start, $._code_span_close, // Opening and closing delimiters for emphasis. $._emphasis_open_star, $._emphasis_open_underscore, $._emphasis_close_star, $._emphasis_close_underscore, // For emphasis we need to tell the parser if the last character was a whitespace (or the // beginning of a line) or a punctuation. These tokens never actually get emitted. $._last_token_whitespace, $._last_token_punctuation, $._strikethrough_open, $._strikethrough_close, // Opening and closing delimiters for latex. These are sequences of one or more dollar signs. 
// An opening token does not mean the text after has to be latex if there is no closing token $._latex_span_start, $._latex_span_close, // Token emitted when encountering opening delimiters for a leaf span, // e.g. a code span, that does not have a matching closing span $._unclosed_span ], precedences: $ => [ // [$._strong_emphasis_star, $._inline_element_no_star], [$._strong_emphasis_star_no_link, $._inline_element_no_star_no_link], // [$._strong_emphasis_underscore, $._inline_element_no_underscore], [$._strong_emphasis_underscore_no_link, $._inline_element_no_underscore_no_link], [$.hard_line_break, $._whitespace], [$.hard_line_break, $._text_base], ], // More conflicts are defined in `add_inline_rules` conflicts: $ => [ [$._closing_tag, $._text_base], [$._open_tag, $._text_base], [$._html_comment, $._text_base], [$._processing_instruction, $._text_base], [$._declaration, $._text_base], [$._cdata_section, $._text_base], [$._link_text_non_empty, $._inline_element], [$._link_text_non_empty, $._inline_element_no_star], [$._link_text_non_empty, $._inline_element_no_underscore], [$._link_text_non_empty, $._inline_element_no_tilde], [$._link_text, $._inline_element], [$._link_text, $._inline_element_no_star], [$._link_text, $._inline_element_no_underscore], [$._link_text, $._inline_element_no_tilde], [$._image_description, $._image_description_non_empty, $._text_base], // [$._image_description, $._image_description_non_empty, $._text_inline], // [$._image_description, $._image_description_non_empty, $._text_inline_no_star], // [$._image_description, $._image_description_non_empty, $._text_inline_no_underscore], [$._image_shortcut_link, $._image_description], [$.shortcut_link, $._link_text], [$.link_destination, $.link_title], [$._link_destination_parenthesis, $.link_title], [$.wiki_link, $._inline_element], [$.wiki_link, $._inline_element_no_star], [$.wiki_link, $._inline_element_no_underscore], [$.wiki_link, $._inline_element_no_tilde], ], extras: $ => [], rules: { 
inline: $ => seq(optional($._last_token_whitespace), $._inline), ...common.rules, // A lot of inlines are defined in `add_inline_rules`, including: // // * collections of inlines // * emphasis // * textual content // // This is done to reduce code duplication, as some inlines need to be parsed differently // depending on the context. For example inlines in ATX headings may not contain newlines. code_span: $ => seq( alias($._code_span_start, $.code_span_delimiter), repeat(choice($._text_base, '[', ']', $._soft_line_break, $._html_tag)), alias($._code_span_close, $.code_span_delimiter) ), latex_block: $ => seq( alias($._latex_span_start, $.latex_span_delimiter), repeat(choice($._text_base, '[', ']', $._soft_line_break, $._html_tag)), alias($._latex_span_close, $.latex_span_delimiter), ), // Different kinds of links: // * inline links (https://github.github.com/gfm/#inline-link) // * full reference links (https://github.github.com/gfm/#full-reference-link) // * collapsed reference links (https://github.github.com/gfm/#collapsed-reference-link) // * shortcut links (https://github.github.com/gfm/#shortcut-reference-link) // // Dynamic precedence is distributed as granular as possible to help the parser decide // while parsing which branch is the most important. 
// // https://github.github.com/gfm/#links _link_text: $ => prec.dynamic(PRECEDENCE_LEVEL_LINK, choice( $._link_text_non_empty, seq('[', ']') )), _link_text_non_empty: $ => seq('[', alias($._inline_no_link, $.link_text), ']'), shortcut_link: $ => prec.dynamic(PRECEDENCE_LEVEL_LINK, $._link_text_non_empty), full_reference_link: $ => prec.dynamic(2 * PRECEDENCE_LEVEL_LINK, seq( $._link_text, $.link_label )), collapsed_reference_link: $ => prec.dynamic(PRECEDENCE_LEVEL_LINK, seq( $._link_text, '[', ']' )), inline_link: $ => prec.dynamic(PRECEDENCE_LEVEL_LINK, seq( $._link_text, '(', repeat(choice($._whitespace, $._soft_line_break)), optional(seq( choice( seq( $.link_destination, optional(seq( repeat1(choice($._whitespace, $._soft_line_break)), $.link_title )) ), $.link_title, ), repeat(choice($._whitespace, $._soft_line_break)), )), ')' )), wiki_link: $ => prec.dynamic(2 * PRECEDENCE_LEVEL_LINK, seq( '[', '[', alias($._wiki_link_destination, $.link_destination), optional(seq( '|', alias($._wiki_link_text, $.link_text) )), ']', ']' ) ), _wiki_link_destination: $ => repeat1(choice( $._word, common.punctuation_without($, ['[',']', '|']), $._whitespace, )), _wiki_link_text: $ => repeat1(choice( $._word, common.punctuation_without($, ['[',']']), $._whitespace, )), // Images work exactly like links with a '!' added in front. 
// // https://github.github.com/gfm/#images image: $ => choice( $._image_inline_link, $._image_shortcut_link, $._image_full_reference_link, $._image_collapsed_reference_link ), _image_inline_link: $ => prec.dynamic(PRECEDENCE_LEVEL_LINK, seq( $._image_description, '(', repeat(choice($._whitespace, $._soft_line_break)), optional(seq( choice( seq( $.link_destination, optional(seq( repeat1(choice($._whitespace, $._soft_line_break)), $.link_title )) ), $.link_title, ), repeat(choice($._whitespace, $._soft_line_break)), )), ')' )), _image_shortcut_link: $ => prec.dynamic(3 * PRECEDENCE_LEVEL_LINK, $._image_description_non_empty), _image_full_reference_link: $ => prec.dynamic(PRECEDENCE_LEVEL_LINK, seq($._image_description, $.link_label)), _image_collapsed_reference_link: $ => prec.dynamic(PRECEDENCE_LEVEL_LINK, seq($._image_description, '[', ']')), _image_description: $ => prec.dynamic(3 * PRECEDENCE_LEVEL_LINK, choice($._image_description_non_empty, seq('!', '[', prec(1, ']')))), _image_description_non_empty: $ => seq('!', '[', alias($._inline, $.image_description), prec(1, ']')), // Autolinks. Uri autolinks actually accept protocolls of arbitrary length which does not // align with the spec. This is because the binary for the grammar gets to large if done // otherwise as tree-sitters code generation is not very concise for this type of regex. // // Email autolinks do not match every valid email (emails normally should not be parsed // using regexes), but this is how they are defined in the spec. // // https://github.github.com/gfm/#autolinks uri_autolink: $ => /<[a-zA-Z][a-zA-Z0-9+\.\-][a-zA-Z0-9+\.\-]*:[^ \t\r\n<>]*>/, email_autolink: $ => /<[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>/, // Raw html. As with html blocks we do not emit additional information as this is best done // by a proper html tree-sitter grammar. 
// // https://github.github.com/gfm/#raw-html _html_tag: $ => choice($._open_tag, $._closing_tag, $._html_comment, $._processing_instruction, $._declaration, $._cdata_section), _open_tag: $ => prec.dynamic(PRECEDENCE_LEVEL_HTML, seq('<', $._tag_name, repeat($._attribute), repeat(choice($._whitespace, $._soft_line_break)), optional('/'), '>')), _closing_tag: $ => prec.dynamic(PRECEDENCE_LEVEL_HTML, seq('<', '/', $._tag_name, repeat(choice($._whitespace, $._soft_line_break)), '>')), _tag_name: $ => seq($._word_no_digit, repeat(choice($._word_no_digit, $._digits, '-'))), _attribute: $ => seq(repeat1(choice($._whitespace, $._soft_line_break)), $._attribute_name, repeat(choice($._whitespace, $._soft_line_break)), '=', repeat(choice($._whitespace, $._soft_line_break)), $._attribute_value), _attribute_name: $ => /[a-zA-Z_:][a-zA-Z0-9_\.:\-]*/, _attribute_value: $ => choice( /[^ \t\r\n"'=<>`]+/, seq("'", repeat(choice($._word, $._whitespace, $._soft_line_break, common.punctuation_without($, ["'"]))), "'"), seq('"', repeat(choice($._word, $._whitespace, $._soft_line_break, common.punctuation_without($, ['"']))), '"'), ), _html_comment: $ => prec.dynamic(PRECEDENCE_LEVEL_HTML, seq( '' )), _processing_instruction: $ => prec.dynamic(PRECEDENCE_LEVEL_HTML, seq( '' )), _declaration: $ => prec.dynamic(PRECEDENCE_LEVEL_HTML, seq( /']), ))), '>' )), _cdata_section: $ => prec.dynamic(PRECEDENCE_LEVEL_HTML, seq( '' )), // A hard line break. // // https://github.github.com/gfm/#hard-line-breaks hard_line_break: $ => seq(choice('\\', $._whitespace_ge_2), $._soft_line_break), _text: $ => choice($._word, common.punctuation_without($, []), $._whitespace), // Whitespace is divided into single whitespaces and multiple whitespaces as we need this // information for hard line breaks. 
_whitespace_ge_2: $ => /\t| [ \t]+/, _whitespace: $ => seq(choice($._whitespace_ge_2, / /), optional($._last_token_whitespace)), // Other than whitespace we tokenize into strings of digits, punctuation characters // (handled by `common.punctuation_without`) and strings of any other characters. This way the // lexer does not have too many different states, which makes it a lot easier to get // conflicts to work. _word: $ => choice($._word_no_digit, $._digits), _word_no_digit: $ => new RegExp('[^' + PUNCTUATION_CHARACTERS_REGEX + ' \\t\\n\\r0-9]+(_+[^' + PUNCTUATION_CHARACTERS_REGEX + ' \\t\\n\\r0-9]+)*'), _digits: $ => /[0-9][0-9_]*/, _soft_line_break: $ => seq($._newline_token, optional($._last_token_whitespace)), _inline_base: $ => prec.right(repeat1(choice( $.image, $._soft_line_break, $.backslash_escape, $.hard_line_break, $.uri_autolink, $.email_autolink, $.entity_reference, $.numeric_character_reference, (common.EXTENSION_LATEX ? $.latex_block : choice()), $.code_span, alias($._html_tag, $.html_tag), $._text_base, common.EXTENSION_TAGS ? $.tag : choice(), $._unclosed_span, ))), _text_base: $ => choice( $._word, common.punctuation_without($, ['[', ']']), $._whitespace, '