// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
mod foreach_html5lib_test;
use foreach_html5lib_test::foreach_html5lib_test;
use html5ever::tendril::*;
use html5ever::tokenizer::states::{
CdataSection, Data, Plaintext, RawData, Rawtext, Rcdata, ScriptData,
};
use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token};
use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag};
use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
use html5ever::{namespace_url, ns, Attribute, LocalName, QualName};
use serde_json::{Map, Value};
use std::cell::RefCell;
use std::ffi::OsStr;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use std::{char, env};
use util::runner::Test;
mod util {
pub mod runner;
}
#[derive(Debug)]
struct TestError;
impl PartialEq for TestError {
fn eq(&self, _: &TestError) -> bool {
// TODO: actually match exact error messages
true
}
}
// Some large test cases hang effectively forever without an upper bound on
// the number of splits to generate.
const MAX_SPLITS: usize = 1000;
// Return all ways of splitting the string into at most n
// possibly-empty pieces.
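// For example, splits("ab", 2) yields ["", "ab"], ["a", "b"], ["ab", ""],
// and (via the recursive extend below) the single piece ["ab"].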
fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> {
if n == 1 {
return vec![vec![s.to_tendril()]];
}
let mut out = vec![];
for p in s.char_indices().map(|(i, _)| i).chain(Some(s.len())) {
let y = &s[p..];
for mut x in splits(&s[..p], n - 1).into_iter() {
x.push(y.to_tendril());
out.push(x);
}
}
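// Also include every way of splitting into fewer than n pieces.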
out.extend(splits(s, n - 1));
out.truncate(MAX_SPLITS);
out
}
struct TokenLogger {
tokens: RefCell<Vec<Token>>,
errors: RefCell<Vec<TestError>>,
current_str: RefCell<StrTendril>,
exact_errors: bool,
}
impl TokenLogger {
fn new(exact_errors: bool) -> TokenLogger {
TokenLogger {
tokens: RefCell::new(vec![]),
errors: RefCell::new(vec![]),
current_str: RefCell::new(StrTendril::new()),
exact_errors,
}
}
// Push a token other than CharacterTokens, flushing any buffered characters first.
fn push(&self, token: Token) {
self.finish_str();
self.tokens.borrow_mut().push(token);
}
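// Flush any buffered character data as a single CharacterTokens token.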
fn finish_str(&self) {
if !self.current_str.borrow().is_empty() {
let s = self.current_str.take();
self.tokens.borrow_mut().push(CharacterTokens(s));
}
}
fn get_tokens(self) -> (Vec<Token>, Vec<TestError>) {
self.finish_str();
(self.tokens.take(), self.errors.take())
}
}
impl TokenSink for TokenLogger {
type Handle = ();
fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
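// Buffer character data so that runs of adjacent character tokens
// are compared as a single CharacterTokens token.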
CharacterTokens(b) => {
self.current_str.borrow_mut().push_slice(&b);
},
NullCharacterToken => {
self.current_str.borrow_mut().push_char('\0');
},
ParseError(_) => {
if self.exact_errors {
self.errors.borrow_mut().push(TestError);
}
},
TagToken(mut t) => {
// The spec seems to indicate that one can emit
// erroneous end tags with attrs, but the test
// cases don't contain them.
match t.kind {
EndTag => {
t.self_closing = false;
t.attrs = vec![];
},
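// Sort attributes for comparison, since JSON objects are unordered.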
_ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
}
self.push(TagToken(t));
},
EOFToken => (),
_ => self.push(token),
}
TokenSinkResult::Continue
}
}
fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> (Vec<Token>, Vec<TestError>) {
let sink = TokenLogger::new(opts.exact_errors);
let tok = Tokenizer::new(sink, opts);
let buffer = BufferQueue::default();
for chunk in input.into_iter() {
buffer.push_back(chunk);
let _ = tok.feed(&buffer);
}
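// A final feed drains anything still buffered; with a sink that never
// suspends, this is effectively a defensive no-op.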
let _ = tok.feed(&buffer);
tok.end();
tok.sink.get_tokens()
}
trait JsonExt: Sized {
fn get_str(&self) -> String;
fn get_tendril(&self) -> StrTendril;
fn get_nullable_tendril(&self) -> Option<StrTendril>;
fn get_bool(&self) -> bool;
fn get_obj(&self) -> &Map<String, Value>;
fn get_list(&self) -> &Vec<Value>;
fn find(&self, key: &str) -> &Self;
}
impl JsonExt for Value {
fn get_str(&self) -> String {
match *self {
Value::String(ref s) => s.to_string(),
_ => panic!("Value::get_str: not a String"),
}
}
fn get_tendril(&self) -> StrTendril {
match *self {
Value::String(ref s) => s.to_tendril(),
_ => panic!("Value::get_tendril: not a String"),
}
}
fn get_nullable_tendril(&self) -> Option<StrTendril> {
match *self {
Value::Null => None,
Value::String(ref s) => Some(s.to_tendril()),
_ => panic!("Value::get_nullable_tendril: not a String"),
}
}
fn get_bool(&self) -> bool {
match *self {
Value::Bool(b) => b,
_ => panic!("Value::get_bool: not a Bool"),
}
}
fn get_obj(&self) -> &Map<String, Value> {
match self {
Value::Object(m) => m,
_ => panic!("Value::get_obj: not an Object"),
}
}
fn get_list(&self) -> &Vec<Value> {
match self {
Value::Array(m) => m,
_ => panic!("Value::get_list: not an Array"),
}
}
fn find(&self, key: &str) -> &Value {
self.get_obj().get(key).unwrap()
}
}
// Parse a JSON object (other than "ParseError") to a token.
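// Tokens in the test JSON look like ["StartTag", "a", {"href": "x"}],
// ["EndTag", "a"], or ["Character", "foo"]; the first element names the kind.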
fn json_to_token(js: &Value) -> Token {
let parts = js.get_list();
// Collect refs here so we don't have to use "ref" in all the patterns below.
let args: Vec<&Value> = parts[1..].iter().collect();
match &*parts[0].get_str() {
"DOCTYPE" => DoctypeToken(Doctype {
name: args[0].get_nullable_tendril(),
public_id: args[1].get_nullable_tendril(),
system_id: args[2].get_nullable_tendril(),
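// The fourth element is html5lib-tests' "correctness" flag: true means
// the force-quirks flag is *not* set, hence the negation.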
force_quirks: !args[3].get_bool(),
}),
"StartTag" => TagToken(Tag {
kind: StartTag,
name: LocalName::from(&*args[0].get_str()),
attrs: args[1]
.get_obj()
.iter()
.map(|(k, v)| Attribute {
name: QualName::new(None, ns!(), LocalName::from(&**k)),
value: v.get_tendril(),
})
.collect(),
self_closing: match args.get(2) {
Some(b) => b.get_bool(),
None => false,
},
}),
"EndTag" => TagToken(Tag {
kind: EndTag,
name: LocalName::from(&*args[0].get_str()),
attrs: vec![],
self_closing: false,
}),
"Comment" => CommentToken(args[0].get_tendril()),
"Character" => CharacterTokens(args[0].get_tendril()),
// We don't need to produce NullCharacterToken because
// the TokenLogger converts null characters into CharacterTokens.
_ => panic!("don't understand token {:?}", parts),
}
}
// Parse the "output" field of the test case into a vector of tokens.
fn json_to_tokens(
js_tokens: &Value,
js_errors: &[Value],
exact_errors: bool,
) -> (Vec<Token>, Vec<TestError>) {
// Use a TokenLogger so that we combine character tokens separated
// by an ignored error.
let sink = TokenLogger::new(exact_errors);
for tok in js_tokens.get_list().iter() {
assert_eq!(
sink.process_token(json_to_token(tok), 0),
TokenSinkResult::Continue
);
}
for err in js_errors {
assert_eq!(
sink.process_token(ParseError(err.find("code").get_str().into()), 0),
TokenSinkResult::Continue
);
}
sink.get_tokens()
}
// Undo the escaping in "doubleEscaped" tests.
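// For example, the input "\u0041" unescapes to "A"; escapes naming lone
// surrogates (e.g. "\uD800") yield None, since UTF-8 cannot represent them.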
fn unescape(s: &str) -> Option<String> {
let mut out = String::with_capacity(s.len());
let mut it = s.chars().peekable();
loop {
match it.next() {
None => return Some(out),
Some('\\') => {
if it.peek() != Some(&'u') {
panic!("can't understand escape");
}
let _ = it.next(); // consume the 'u'
let hex: String = it.by_ref().take(4).collect();
match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) {
// Some of the tests use lone surrogates, but we have no
// way to represent them in the UTF-8 input to our parser.
// Since these can only come from script, we will catch
// them there.
None => return None,
Some(c) => out.push(c),
}
},
Some(c) => out.push(c),
}
}
}
fn unescape_json(js: &Value) -> Value {
match js {
// unwrap is OK here because the spec'd *output* of the tokenizer never
// contains a lone surrogate.
Value::String(s) => Value::String(unescape(s).unwrap()),
Value::Array(xs) => Value::Array(xs.iter().map(unescape_json).collect()),
Value::Object(obj) => {
let mut new_obj = Map::new();
for (k, v) in obj.iter() {
new_obj.insert(k.clone(), unescape_json(v));
}
Value::Object(new_obj)
},
_ => js.clone(),
}
}
fn mk_test(
desc: String,
input: String,
expect: Value,
expect_errors: Vec<Value>,
opts: TokenizerOpts,
) -> Test {
Test {
name: desc,
skip: false,
test: Box::new(move || {
// Split up the input at different points to test incremental tokenization.
let insplits = splits(&input, 3);
for input in insplits.into_iter() {
// Clone 'input' so we have it for the failure message.
// Also clone opts. If we don't, we get the wrong
// result but the compiler doesn't catch it!
// Possibly mozilla/rust#12223.
let output = tokenize(input.clone(), opts.clone());
let expect_toks = json_to_tokens(&expect, &expect_errors, opts.exact_errors);
if output != expect_toks {
panic!(
"\ninput: {:?}\ngot: {:?}\nexpected: {:?}",
input, output, expect_toks
);
}
}
}),
}
}
fn mk_tests(tests: &mut Vec<Test>, filename: &str, js: &Value) {
let obj = js.get_obj();
let mut input = js.find("input").get_str();
let mut expect = js.find("output").clone();
let expect_errors = js
.get("errors")
.map(JsonExt::get_list)
.map(Vec::as_slice)
.unwrap_or_default();
let desc = format!("tok: {}: {}", filename, js.find("description").get_str());
// "Double-escaped" tests require additional processing of
// the input and output.
if obj.get("doubleEscaped").map_or(false, |j| j.get_bool()) {
match unescape(&input) {
None => return,
Some(i) => input = i,
}
expect = unescape_json(&expect);
}
// Some tests have a last start tag name.
let start_tag = obj.get("lastStartTag").map(|s| s.get_str());
// Some tests want to start in a state other than Data.
let state_overrides = match obj.get("initialStates") {
Some(Value::Array(xs)) => xs
.iter()
.map(|s| {
Some(match &s.get_str()[..] {
"PLAINTEXT state" => Plaintext,
"RAWTEXT state" => RawData(Rawtext),
"RCDATA state" => RawData(Rcdata),
"Script data state" => RawData(ScriptData),
"CDATA section state" => CdataSection,
"Data state" => Data,
s => panic!("don't know state {}", s),
})
})
.collect(),
None => vec![None],
_ => panic!("don't understand initialStates value"),
};
// Build the tests.
for state in state_overrides.into_iter() {
for &exact_errors in [false, true].iter() {
let mut newdesc = desc.clone();
if let Some(s) = state {
newdesc = format!("{} (in state {:?})", newdesc, s)
};
if exact_errors {
newdesc = format!("{} (exact errors)", newdesc);
}
tests.push(mk_test(
newdesc,
input.clone(),
expect.clone(),
expect_errors.to_owned(),
TokenizerOpts {
exact_errors,
initial_state: state,
last_start_tag_name: start_tag.clone(),
// Not discarding a BOM is what the test suite expects; see
// https://github.com/html5lib/html5lib-tests/issues/2
discard_bom: false,
..Default::default()
},
));
}
}
}
fn tests(src_dir: &Path) -> Vec<Test> {
let mut tests = vec![];
let mut add_test = |path: &Path, mut file: File| {
let mut s = String::new();
file.read_to_string(&mut s).expect("file reading error");
let js: Value = serde_json::from_str(&s).expect("json parse error");
if let Some(Value::Array(lst)) = js.get_obj().get("tests") {
for test in lst.iter() {
mk_tests(
&mut tests,
path.file_name().unwrap().to_str().unwrap(),
test,
)
}
}
};
foreach_html5lib_test(
src_dir,
"html5lib-tests/tokenizer",
OsStr::new("test"),
&mut add_test,
);
foreach_html5lib_test(
src_dir,
"custom-html5lib-tokenizer-tests",
OsStr::new("test"),
&mut add_test,
);
tests
}
fn main() {
for test in tests(Path::new(env!("CARGO_MANIFEST_DIR"))) {
test.run();
}
}