//! HTML Parser
//!
//! Fork from:
//! https://github.com/mathiversen/html-parser/blob/v0.6.3/src/grammar/rules.pest

// Rule modifiers used throughout this grammar:
//   _{ } silent          - produces no pair of its own
//   @{ } atomic          - no implicit WHITESPACE, no inner pairs
//   ${ } compound-atomic - no implicit WHITESPACE, inner pairs kept

// Entry point: leading comments, an optional doctype, then any number of nodes.
item = _{ SOI ~ comment* ~ doctype? ~ node* ~ EOI }

// Other
// Fallback that consumes a single character when no structured node matches,
// so unrecognised input does not abort the parse.
other = { !(comment | node_element | server | text) ~ ANY }

// Implicit whitespace skipped between tokens of non-atomic rules.
WHITESPACE = { " " | "\t" | NEWLINE }

/// Code
// NOTE(review): the delimiter literals were empty strings in the reviewed copy,
// which makes `code` match nothing and `node*` non-progressing. They are
// restored here as a triple-backtick fence on the assumption that the
// backticks were stripped by a text extraction step; confirm against the
// repository copy.
code = ${ "```" ~ (!"```" ~ ANY)* ~ "```" }

/// DOCTYPE
doctype      = { chevron_left_bang ~ doctype_name ~ attr* ~ chevron_right_normal }
doctype_name = @{ ^"doctype" }

/// SERVER CODE
// Everything between `<%` and the first `%>` is captured verbatim.
server               = ${ chevron_left_server ~ server_code ~ chevron_right_server }
server_code          = { (!chevron_right_server ~ ANY)* }
chevron_left_server  = { "<%" }
chevron_right_server = { "%>" }

/// NODES
// Ordered choice: earlier alternatives win, `other` is the last resort.
node    = _{ code | server | comment | node_element | text | other }
comment = { comment_if | comment_normal }
// Text runs until the next `<` or comment opener.
text    = ${ (!(chevron_left_normal | comment_tag_start) ~ ANY)+ }
// NOTE: Should we be able to write < in text? ^^
node_element = {
    el_void
  | el_void_xml
  | el_process_instruct
  | javascript_text
  | style_text
  | el_raw_text
  | el_normal
  | el_dangling
}

/// COMMENTS
comment_normal    = _{ comment_tag_start ~ (!comment_tag_end ~ ANY)* ~ comment_tag_end }
comment_tag_start = _{ chevron_left_bang ~ "--" }
comment_tag_end   = _{ "--" ~ chevron_right_normal }

/// Compatibility with old IE browsers... This is not necessary for newer browsers
comment_if       = _{ comment_if_start ~ (!comment_if_end ~ ANY)* ~ comment_if_end }
comment_if_start = _{ comment_tag_start ~ "[" ~ ^"if" }
comment_if_end   = _{ chevron_left_bang ~ "[" ~ ^"endif" ~ "]" ~ comment_tag_end }

/// ATTRIBUTES
// A key, optionally followed by `=` and a quoted or unquoted value.
attr = { attr_key ~ (equal ~ (attr_non_quoted | attr_quoted))? }
// The opening quote is pushed on the stack so that the value stops at, and
// the attribute closes with, the same quote character.
attr_quoted     = { PUSH(quote) ~ attr_value ~ end_attr_quoted }
end_attr_quoted = { POP }
// Unquoted value: runs up to whitespace or any closing chevron.
attr_non_quoted = @{ !quote ~ (!(WHITESPACE | chevron_right) ~ ANY)* }
// Keys may carry one leading `:`, `@`, `#` or `.` sigil before the first letter.
attr_key        = ${ !WHITESPACE ~ (":" | "@" | "#" | ".")? ~ ASCII_ALPHA ~ text_chars* }
// `!PEEK` stops at the quote character currently on top of the stack.
attr_value      = ${ WHITESPACE* ~ (!PEEK ~ ANY)* ~ WHITESPACE* }

/// ELEMENTS
el_name = @{ ASCII_ALPHA ~ text_chars* }

/// Void element aka self-closing element
/// Ex: <hr> or <hr />
el_void_name_html = @{
    ^"area"
  | ^"base"
  | ^"br"
  | ^"col"
  | ^"command"
  | ^"embed"
  | ^"hr"
  | ^"img"
  | ^"input"
  | ^"keygen"
  | ^"link"
  | ^"meta"
  | ^"param"
  | ^"source"
  | ^"track"
  | ^"wbr"
}

// NOTE: This should not have to be a rule, but people don't know what void elements are...
el_void_name_svg = @{ ^"path" | ^"polygon" | ^"rect" | ^"circle" }

// The `!text_chars` lookahead requires the void name to be the complete tag
// name. Without it `<colgroup>` is read as void element `col` followed by an
// attribute `group`, and likewise for any tag that merely starts with a void name.
el_void_name = @{ (el_void_name_html | el_void_name_svg) ~ !text_chars }

// A known void name closed by either `>` or `/>`.
el_void     = _{ chevron_left_normal ~ el_void_name ~ attr* ~ (chevron_right_normal | chevron_right_closed) }
// Any element name written in XML self-closing form `<name ... />`.
el_void_xml = _{ chevron_left_normal ~ el_name ~ attr* ~ chevron_right_closed }

/// Open elements are default element that can take children
/// and have both a start tag and an end tag
/// Ex: <div>...</div>
el_normal       = _{ el_normal_start ~ (!el_normal_end ~ node)* ~ el_normal_end }
el_normal_start = _{ chevron_left_normal ~ PUSH(el_name) ~ attr* ~ chevron_right_normal }
// NOTE(review): every closing-tag rule in this group uses PUSH, so the closing
// name is accepted as any name matching the inner rule and is not compared
// with the opening tag's name. POP would enforce a matching name; confirm the
// leniency is intentional before changing it.
el_normal_end   = _{ chevron_left_closed ~ PUSH(el_name) ~ chevron_right_normal }

/// Raw text elements are elements with text/script content that
/// might interfere with the normal html syntax
el_raw_text_name    = { ^"title" | ^"textarea" }
// Content is taken verbatim up to the first closing tag of a raw text element.
el_raw_text_content = ${ (!el_raw_text_end ~ ANY)* }
el_raw_text         = _{ el_raw_text_start ~ el_raw_text_content ~ el_raw_text_end }
el_raw_text_start   = _{ chevron_left_normal ~ PUSH(el_raw_text_name) ~ attr* ~ chevron_right_normal }
el_raw_text_end     = { chevron_left_closed ~ PUSH(el_raw_text_name) ~ chevron_right_normal }

// <script> element: the body is captured verbatim up to the closing script tag.
inline_javascript  = ${ (!javascript_end ~ ANY)* }
javascript_text    = _{ javascript_start ~ inline_javascript ~ javascript_end }
el_javascript_name = @{ ^"script" }
javascript_start   = _{ chevron_left_normal ~ PUSH(el_javascript_name) ~ attr* ~ chevron_right_normal }
javascript_end     = { chevron_left_closed ~ PUSH(el_javascript_name) ~ chevron_right_normal }

// <style> element: the body is captured verbatim up to the closing style tag.
inline_style  = ${ (!style_end ~ ANY)* }
el_style_name = @{ ^"style" }
style_text    = _{ style_start ~ inline_style ~ style_end }
style_start   = _{ chevron_left_normal ~ PUSH(el_style_name) ~ attr* ~ chevron_right_normal }
style_end     = { chevron_left_closed ~ PUSH(el_style_name) ~ chevron_right_normal }

/// XML processing instruction
/// Ex: <?xml version="1.0" ?>
el_process_instruct = { chevron_left_question ~ el_name? ~ attr* ~ chevron_right_question }

/// Catch dangling elements
/// Ex: </div>
el_dangling = { chevron_left_closed ~ el_name ~ chevron_right_normal }

/// SYMBOLS / CHARACTERS
// Characters allowed after the first letter of an element or attribute name.
text_chars = _{ 'a'..'z' | 'A'..'Z' | "_" | "-" | ":" | '0'..'9' }

/// tag left
chevron_left_normal   = { "<" }
// An empty literal here would match at every position, so the closing-tag
// opener must be spelled out.
chevron_left_closed   = { "</" }
chevron_left_bang     = { "<!" }
chevron_left_question = { "<?" }

/// tag right
chevron_right_normal   = { ">" }
chevron_right_closed   = { "/>" }
chevron_right_question = { "?>" }
// Any closing chevron; used to terminate unquoted attribute values.
chevron_right = _{ chevron_right_normal | chevron_right_closed | chevron_right_question }

/// other
equal = @{ "=" }
quote = @{ "\"" | "'" }