#[macro_use] extern crate html5ever;
#[macro_use] extern crate lazy_static;
use html5ever::{driver as html, QualName};
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use pulldown_cmark_fork::{Parser, Options};
use std::collections::HashSet;
use std::mem;
use std::rc::{Rc, Weak};
use tendril::stream::TendrilSink;
use regex::Regex;
mod suite;
/// Renders `input` as Markdown with the table, footnote, strikethrough and task-list
/// extensions enabled, then asserts that the produced HTML equals `output` once both
/// sides have been passed through `normalize_html`.
///
/// Panics (via `assert_eq!`) when the normalized documents differ.
#[inline(never)]
pub fn test_markdown_html(input: &str, output: &str) {
    let mut opts = Options::empty();
    for &extension in &[
        Options::ENABLE_TABLES,
        Options::ENABLE_FOOTNOTES,
        Options::ENABLE_STRIKETHROUGH,
        Options::ENABLE_TASKLISTS,
    ] {
        opts.insert(extension);
    }

    let mut rendered = String::new();
    pulldown_cmark_fork::html::push_html(&mut rendered, Parser::new_ext(input, opts));

    // Expected HTML is normalized first, then the rendered HTML, so the assertion
    // reports "left = expected, right = actual".
    let expected = normalize_html(output);
    let actual = normalize_html(&rendered);
    assert_eq!(expected, actual);
}
// Lazily-initialized tables shared by the HTML normalization helpers in this file.
lazy_static! {
// Matches any run of one or more whitespace characters.
static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap();
// Matches a whitespace run anchored at the very start of the text.
static ref LEADING_WHITESPACE_RE: Regex = Regex::new(r"\A\s+").unwrap();
// Matches a whitespace run anchored at the very end of the text.
static ref TRAILING_WHITESPACE_RE: Regex = Regex::new(r"\s+\z").unwrap();
// Element names treated as block boundaries: normalize_node stops its
// first-in-block / last-in-block ascent when it reaches one of these.
static ref BLOCK_TAGS: HashSet<&'static str> = [
"article", "header", "aside", "hgroup", "blockquote", "hr", "iframe", "body", "li",
"map", "button", "object", "canvas", "ol", "caption", "output", "col", "p", "colgroup",
"pre", "dd", "progress", "div", "section", "dl", "table", "td", "dt", "tbody", "embed",
"textarea", "fieldset", "tfoot", "figcaption", "th", "figure", "thead", "footer", "tr",
"form", "ul", "h1", "h2", "h3", "h4", "h5", "h6", "video", "script", "style"
].iter().cloned().collect();
// Text that has an ancestor element with one of these names is left exactly as-is
// by normalize_node (no whitespace collapsing or trimming).
static ref PRE_TAGS: HashSet<&'static str> = [
"pre", "code"
].iter().cloned().collect();
// For text whose immediate parent is one of these elements, normalize_node deletes
// whitespace runs outright instead of collapsing each run to a single space.
static ref TABLE_TAGS: HashSet<&'static str> = [
"table", "thead", "tbody", "tr", "td"
].iter().cloned().collect();
}
fn make_html_parser() -> html::Parser {
html::parse_fragment(
RcDom::default(),
html::ParseOpts::default(),
QualName::new(None, ns!(html), local_name!("div")),
vec![],
)
}
/// Parses `s` as an HTML fragment, normalizes the resulting DOM with `normalize_dom`,
/// and serializes the normalized tree back to a `String` using the default
/// serialization options.
///
/// Panics if serialization fails or if the serialized bytes are not valid UTF-8.
fn normalize_html(s: &str) -> String {
    let dom = make_html_parser().one(s);
    let root = normalize_dom(&dom);

    let mut bytes = Vec::new();
    serialize(&mut bytes, &root, SerializeOpts::default())
        .expect("Writing to a string shouldn't fail (expect on OOM)");

    String::from_utf8(bytes).expect("html5ever should always produce UTF8")
}
/// Normalizes, in place, every node below the first child of the document node and
/// returns a handle to that first child.
///
/// Nodes are visited one tree level at a time. Each node is passed to `normalize_node`
/// together with its parent; a node for which that returns `false` is detached from its
/// parent's child list and its children are never visited.
///
/// Panics if the document has no children, or if a visited node has no live parent.
fn normalize_dom(dom: &RcDom) -> Handle {
    let body = dom.document.children.borrow()[0].clone();

    // Both stacks hold nodes in reverse document order so that `pop` yields them
    // front-to-back.
    let mut pending: Vec<Handle> = body.children.borrow().iter().rev().cloned().collect();
    let mut deferred: Vec<Handle> = Vec::new();

    while !pending.is_empty() {
        while let Some(mut current) = pending.pop() {
            // Read the parent link by swapping it out and immediately putting a clone back.
            let weak_parent = current.parent.replace(None);
            current.parent.replace(weak_parent.clone());
            let owner = weak_parent
                .expect("a node in the DOM will have a parent, except the root, which is not processed")
                .upgrade()
                .expect("a node's parent will be pointed to by its parent (or the root pointer), and will not be dropped");

            if normalize_node(&owner, &mut current) {
                // Kept: queue its children for the next level.
                deferred.extend(current.children.borrow().iter().rev().cloned());
            } else {
                // Rejected: unlink exactly this node (pointer identity) from its parent.
                owner
                    .children
                    .borrow_mut()
                    .retain(|sibling| !Rc::ptr_eq(&current, sibling));
            }
        }
        // The level just finished is empty; the collected children become the next level.
        mem::swap(&mut deferred, &mut pending);
    }

    body
}
// Normalizes a single DOM node in place and reports whether it should stay in the tree.
//
// `parent` is the node's current parent; `node` is the node being normalized.
// - Comments, doctypes, document nodes and processing instructions are kept untouched.
// - Text nodes that are not inside a PRE_TAGS element get their whitespace collapsed
//   and, depending on position, trimmed at the start and/or end.
// - Elements get their attribute names ASCII-lowercased and their attributes sorted
//   by local name.
//
// Returns false if node is (after normalization) an empty text node, or a tbody that has
// no children or whose only child is a whitespace-only text node.
// Returns true otherwise.
fn normalize_node(parent: &Handle, node: &mut Handle) -> bool {
match node.data {
// Node kinds that carry nothing to normalize: always keep them.
NodeData::Comment { .. } |
NodeData::Doctype { .. } |
NodeData::Document |
NodeData::ProcessingInstruction { .. } => true,
NodeData::Text { ref contents, .. } => {
// Mutable borrow of this node's text, held for the rest of the arm; the text is
// rewritten in place below.
let mut contents = contents.borrow_mut();
// True when `parent` or any ancestor further up is an element listed in PRE_TAGS.
let is_pre = {
let mut parent = parent.clone();
loop {
let is_pre = if let NodeData::Element{ ref name, .. } = parent.data {
PRE_TAGS.contains(&&*name.local.to_ascii_lowercase())
} else {
false
};
if is_pre { break true };
// Read the parent link by swapping it out and immediately restoring a clone.
let parent_ = parent.parent.replace(None);
parent.parent.replace(parent_.clone());
let parent_ = parent_.as_ref().and_then(Weak::upgrade);
if let Some(parent_) = parent_ {
parent = parent_
} else {
// Ran out of ancestors without meeting a PRE_TAGS element.
break false
};
}
};
if !is_pre {
// Determine whether this text sits at the very start and/or very end of its
// enclosing block: climb upward while the current node is still the first or the
// last child of its parent, stopping once the parent is a BLOCK_TAGS element or
// there is no further ancestor.
let (is_first_in_block, is_last_in_block) = {
let mut is_first_in_block = true;
let mut is_last_in_block = true;
let mut parent = parent.clone();
let mut node = node.clone();
loop {
let reached_block = if let NodeData::Element{ ref name, .. } = parent.data {
BLOCK_TAGS.contains(&&*name.local.to_ascii_lowercase())
} else {
false
};
// Is `node` the first / the last child of `parent` (compared by pointer identity)?
let (is_first, is_last) = {
let siblings = parent.children.borrow();
let n = &node;
(siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false), siblings.len() > 0 && siblings.get(siblings.len() - 1).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false))
};
// A flag stays true only if it held at every level climbed so far.
is_first_in_block = is_first_in_block && is_first;
is_last_in_block = is_last_in_block && is_last;
// Keep climbing only while at least one flag can still be true and the parent
// just examined was not itself a block boundary.
if (is_first_in_block || is_last_in_block) && !reached_block {
node = parent.clone();
let parent_ = parent.parent.replace(None);
parent.parent.replace(parent_.clone());
let parent_ = parent_.as_ref().and_then(Weak::upgrade);
if let Some(parent_) = parent_ {
parent = parent_;
} else {
break (is_first_in_block, is_last_in_block)
}
} else {
break (is_first_in_block, is_last_in_block)
}
}
};
// Determine whether the text immediately before this node (in document order) ends
// in whitespace. First climb while the current node is a first child, then look at
// the previous sibling and follow last children down to a leaf.
let is_preceeded_by_ws = {
let mut parent = parent.clone();
let mut node = node.clone();
'ascent: loop {
let is_first = {
let siblings = parent.children.borrow();
let n = &node;
siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false)
};
if is_first {
// No previous sibling at this level; try one level up.
node = parent.clone();
let parent_ = parent.parent.replace(None);
parent.parent.replace(parent_.clone());
let parent_ = parent_.as_ref().and_then(Weak::upgrade);
if let Some(parent_) = parent_ {
parent = parent_;
} else {
// Reached the top of the tree: nothing precedes this text at all.
break 'ascent false
}
} else {
let siblings = parent.children.borrow();
let n = &node;
// `!0` (all bits set) is used as a "not found" sentinel for the index.
let mut pos = !0;
'search: for (i, s) in siblings.iter().enumerate() {
if Rc::ptr_eq(s, n) {
pos = i;
break 'search;
}
}
assert!(pos != !0, "The list of node's parent's children shall contain node");
assert!(pos != 0, "If node is not first, then node's position shall not be zero");
// Start from the previous sibling and descend through last children until a
// text node or a childless node is reached.
let mut preceeding = siblings[pos-1].clone();
'descent: loop {
if let NodeData::Text { .. } = preceeding.data {
break 'descent
}
preceeding = {
let ch = preceeding.children.borrow();
if ch.len() == 0 { break 'descent }
if let Some(preceeding_) = ch.get(ch.len() - 1) {
preceeding_.clone()
} else {
break 'descent
}
};
}
// Only a text leaf can supply preceding whitespace; any other leaf yields false.
if let NodeData::Text { ref contents, .. } = preceeding.data {
break 'ascent TRAILING_WHITESPACE_RE.is_match(&*contents.borrow())
} else {
break 'ascent false
}
}
}
};
// Only the immediate parent is checked here (not further ancestors).
let is_in_table = if let NodeData::Element{ ref name, .. } = parent.data {
TABLE_TAGS.contains(&&*name.local.to_ascii_lowercase())
} else {
false
};
// Directly inside table structure whitespace runs are deleted; elsewhere each run
// collapses to one space.
let whitespace_replacement = if is_in_table {
""
} else {
" "
};
*contents = WHITESPACE_RE.replace_all(&*contents, whitespace_replacement).as_ref().into();
// Drop leading whitespace at the start of a block, or when the preceding text
// already ends in whitespace.
if is_first_in_block || is_preceeded_by_ws {
*contents = LEADING_WHITESPACE_RE.replace_all(&*contents, "").as_ref().into();
}
// Drop trailing whitespace at the end of a block.
if is_last_in_block {
*contents = TRAILING_WHITESPACE_RE.replace_all(&*contents, "").as_ref().into();
}
// TODO: collapse whitespace when adjacent to whitespace.
// For example, the whitespace in the span should be collapsed in all of these cases:
//
// " q "
// "q q"
// "q q"
// "q q"
// "q q"
}
// Keep the text node only if something is left in it.
&**contents != ""
}
NodeData::Element { ref attrs, ref name, .. } => {
// Canonicalize attributes: lowercase each local name, then order by local name.
let mut attrs = attrs.borrow_mut();
for a in attrs.iter_mut() {
a.name.local = a.name.local.to_ascii_lowercase().into();
}
attrs.sort_by(|a: &html5ever::Attribute, b: &html5ever::Attribute| {
(&*a.name.local).cmp(&*b.name.local)
});
let ascii_name = &*name.local.to_ascii_lowercase();
// drop empty tbody's
// Every non-tbody element is kept. A tbody is kept if it has more than one child, or
// if its single child is either a non-text node or text containing a non-whitespace
// character. A tbody with no children falls through to `unwrap_or(false)`.
ascii_name != "tbody" ||
node.children.borrow().len() > 1 ||
node.children.borrow().iter().next().map(|only_child| match only_child.data {
NodeData::Text { ref contents, .. } => {
!contents.borrow().chars().all(|c| c.is_whitespace())
}
_ => {
true
}
}).unwrap_or(false)
}
}
}
// Whitespace-only input (an escaped newline followed by a raw line break inside the
// literal) must normalize to the empty string.
// NOTE(review): despite the test name, no `div` markup is present in either literal;
// presumably it was lost somewhere — confirm against the intended fixture.
#[test]
fn strip_div_newline() {
assert_eq!("", normalize_html("\n
"));
}
/// A trailing newline on top-level text is removed by normalization.
#[test]
fn strip_end_newline() {
    let normalized = normalize_html("test\n");
    assert_eq!("test", normalized);
}
/// Text separated by whitespace normalizes to the same words separated by one space.
// NOTE(review): the input literal is identical to the expected literal (single space),
// so no collapsing is exercised here — confirm the input was meant to hold two spaces.
#[test]
fn strip_double_space() {
    let normalized = normalize_html("test mess");
    assert_eq!("test mess", normalized);
}
/// Leading and trailing whitespace around top-level text is trimmed.
// NOTE(review): the test name mentions inline content but the literals contain no
// inline elements — confirm markup was not lost from them.
#[test]
fn strip_inline_internal_text() {
    let normalized = normalize_html(" a b c ");
    assert_eq!("a b c", normalized);
}
/// Leading and trailing whitespace around top-level text is trimmed.
// NOTE(review): the literals are the same as in `strip_inline_internal_text` and hold no
// inline or block elements — confirm markup was not lost from them.
#[test]
fn strip_inline_block_internal_text() {
    let normalized = normalize_html(" a b c ");
    assert_eq!("a b c", normalized);
}
/// Single spaces between words survive normalization unchanged.
#[test]
fn leaves_necessary_whitespace_alone() {
    let normalized = normalize_html("a b c");
    assert_eq!("a b c", normalized);
}
/// A leading space is trimmed while the single spaces between words are kept.
#[test]
fn leaves_necessary_whitespace_alone_weird() {
    let normalized = normalize_html(" a b c");
    assert_eq!("a b c", normalized);
}
/// A lone space normalizes to the empty string.
// NOTE(review): the test name mentions nesting but the input is a single space with no
// elements — confirm markup was not lost from the literals.
#[test]
fn leaves_necessary_whitespace_all_nested() {
    let normalized = normalize_html(" ");
    assert_eq!("", normalized);
}
/// Empty input normalizes to empty output.
// NOTE(review): both literals are empty, so no `tbody` is actually exercised despite
// the test name — confirm the table markup was not lost from them.
#[test]
fn drops_empty_tbody() {
    let normalized = normalize_html("");
    assert_eq!("", normalized);
}
/// Normalizing the fixture must return it unchanged.
// NOTE(review): the fixture is the empty string, so no `tbody` is actually exercised
// despite the test name — confirm the table markup was not lost from it.
#[test]
fn leaves_nonempty_tbody() {
    let markup = "";
    let normalized = normalize_html(markup);
    assert_eq!(markup, normalized);
}