// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
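// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your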
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use serde_json::{Map, Value};
use std::borrow::Cow::Borrowed;
use std::env;
use std::ffi::OsStr;
use std::io::Read;
use std::mem::replace;
use std::path::Path;
use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn};
use util::find_tests::foreach_xml5lib_test;
use markup5ever::buffer_queue::BufferQueue;
use xml5ever::tendril::{SliceExt, StrTendril};
use xml5ever::tokenizer::{CharacterTokens, Token, TokenSink};
use xml5ever::tokenizer::{CommentToken, EmptyTag, EndTag, ShortTag, StartTag, Tag};
use xml5ever::tokenizer::{Doctype, DoctypeToken, PIToken, Pi};
use xml5ever::tokenizer::{EOFToken, XmlTokenizer, XmlTokenizerOpts};
use xml5ever::tokenizer::{NullCharacterToken, ParseError, TagToken};
use xml5ever::{namespace_url, ns, Attribute, LocalName, QualName};
// Local helper modules for this test binary. `find_tests` is the module
// that `foreach_xml5lib_test` (imported above) comes from.
mod util {
pub mod find_tests;
}
// Return all ways of splitting the string into at most n
// possibly-empty pieces.
fn splits(s: &str, n: usize) -> Vec> {
if n == 1 {
return vec![vec![s.to_tendril()]];
}
let mut points: Vec = s.char_indices().map(|(n, _)| n).collect();
points.push(s.len());
// do this with iterators?
let mut out = vec![];
for p in points.into_iter() {
let y = &s[p..];
for mut x in splits(&s[..p], n - 1).into_iter() {
x.push(y.to_tendril());
out.push(x);
}
}
out.extend(splits(s, n - 1).into_iter());
out
}
struct TokenLogger {
tokens: Vec,
current_str: StrTendril,
exact_errors: bool,
}
impl TokenLogger {
fn new(exact_errors: bool) -> TokenLogger {
TokenLogger {
tokens: vec![],
current_str: StrTendril::new(),
exact_errors: exact_errors,
}
}
// Push anything other than character tokens
fn push(&mut self, token: Token) {
self.finish_str();
self.tokens.push(token);
}
fn finish_str(&mut self) {
if self.current_str.len() > 0 {
let s = replace(&mut self.current_str, StrTendril::new());
self.tokens.push(CharacterTokens(s));
}
}
fn get_tokens(mut self) -> Vec {
self.finish_str();
self.tokens
}
}
impl TokenSink for TokenLogger {
    // Record one tokenizer token, normalising it so that logged output and
    // expected output can be compared with `==`.
    fn process_token(&mut self, token: Token) {
        match token {
            // Character data (including NUL characters) is only buffered
            // here; adjacent runs end up merged into one token.
            CharacterTokens(text) => self.current_str.push_slice(&text),
            NullCharacterToken => self.current_str.push_char('\0'),

            // Error messages are blanked out; errors are only recorded at
            // all when exact error checking is on.
            ParseError(_) if self.exact_errors => self.push(ParseError(Borrowed(""))),
            ParseError(_) => {}

            TagToken(mut tag) => {
                // The spec seems to indicate that one can emit
                // erroneous end tags with attrs, but the test
                // cases don't contain them.
                if let EndTag = tag.kind {
                    tag.attrs.clear();
                } else {
                    // Sort attributes by name so comparison does not depend
                    // on the order they were produced in.
                    tag.attrs.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
                }
                self.push(TagToken(tag));
            }

            // End of input is not logged.
            EOFToken => {}

            other => self.push(other),
        }
    }
}
fn tokenize_xml(input: Vec, opts: XmlTokenizerOpts) -> Vec {
let sink = TokenLogger::new(opts.exact_errors);
let mut tok = XmlTokenizer::new(sink, opts);
let mut buf = BufferQueue::new();
for chunk in input.into_iter() {
buf.push_back(chunk);
let _ = tok.feed(&mut buf);
}
let _ = tok.feed(&mut buf);
tok.end();
tok.sink.get_tokens()
}
trait JsonExt: Sized {
fn get_str(&self) -> String;
fn get_tendril(&self) -> StrTendril;
fn get_nullable_tendril(&self) -> Option;
fn get_bool(&self) -> bool;
fn get_obj<'t>(&'t self) -> &'t Map;
fn get_list<'t>(&'t self) -> &'t Vec;
fn find<'t>(&'t self, key: &str) -> &'t Self;
}
// `JsonExt` for `serde_json::Value`. Every accessor panics with a message
// naming the accessor when the value has the wrong JSON type.
impl JsonExt for Value {
    // Panics unless the value is a JSON string.
    fn get_str(&self) -> String {
        match self {
            Value::String(s) => s.clone(),
            _ => panic!("Value::get_str: not a String"),
        }
    }

    // Panics unless the value is a JSON string.
    fn get_tendril(&self) -> StrTendril {
        match self {
            Value::String(s) => s.to_tendril(),
            _ => panic!("Value::get_tendril: not a String"),
        }
    }

    // Maps JSON null to `None`; panics for anything other than null or a string.
    fn get_nullable_tendril(&self) -> Option<StrTendril> {
        match self {
            Value::Null => None,
            Value::String(s) => Some(s.to_tendril()),
            _ => panic!("Value::get_nullable_tendril: not a String"),
        }
    }

    // Panics unless the value is a JSON boolean.
    fn get_bool(&self) -> bool {
        match *self {
            Value::Bool(b) => b,
            _ => panic!("Value::get_bool: not a Boolean"),
        }
    }

    // Panics unless the value is a JSON object.
    fn get_obj<'t>(&'t self) -> &'t Map<String, Value> {
        match self {
            Value::Object(m) => m,
            _ => panic!("Value::get_obj: not an Object"),
        }
    }

    // Panics unless the value is a JSON array.
    fn get_list<'t>(&'t self) -> &'t Vec<Value> {
        match self {
            Value::Array(items) => items,
            _ => panic!("Value::get_list: not an Array"),
        }
    }

    // Panics if the value is not an object or has no member named `key`.
    fn find<'t>(&'t self, key: &str) -> &'t Value {
        // `Map::get` accepts a borrowed `&str`, so no `String` needs to be
        // allocated for the lookup; name the key in the panic to make a
        // malformed test file easy to diagnose.
        self.get_obj()
            .get(key)
            .unwrap_or_else(|| panic!("Value::find: no key {:?}", key))
    }
}
// Parse a JSON object (other than "ParseError") to a token.
fn json_to_token(js: &Value) -> Token {
let parts = js.as_array().unwrap();
// Collect refs here so we don't have to use "ref" in all the patterns below.
let args: Vec<&Value> = parts[1..].iter().collect();
match &*parts[0].get_str() {
"StartTag" => TagToken(Tag {
kind: StartTag,
name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
attrs: args[1]
.get_obj()
.iter()
.map(|(k, v)| Attribute {
name: QualName::new(None, ns!(), LocalName::from(&**k)),
value: v.get_tendril(),
})
.collect(),
}),
"EndTag" => TagToken(Tag {
kind: EndTag,
name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
attrs: vec![],
}),
"ShortTag" => TagToken(Tag {
kind: ShortTag,
name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
attrs: vec![],
}),
"EmptyTag" => TagToken(Tag {
kind: EmptyTag,
name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
attrs: args[1]
.get_obj()
.iter()
.map(|(k, v)| Attribute {
name: QualName::new(None, ns!(), LocalName::from(&**k)),
value: v.get_tendril(),
})
.collect(),
}),
"Comment" => CommentToken(args[0].get_tendril()),
"Character" => CharacterTokens(args[0].get_tendril()),
"PI" => PIToken(Pi {
target: args[0].get_tendril(),
data: args[1].get_tendril(),
}),
"DOCTYPE" => DoctypeToken(Doctype {
name: args[0].get_nullable_tendril(),
public_id: args[1].get_nullable_tendril(),
system_id: args[2].get_nullable_tendril(),
}),
// We don't need to produce NullCharacterToken because
// the TokenLogger will convert them to CharacterTokens.
_ => panic!("don't understand token {:?}", parts),
}
}
// Parse the "output" field of the test case into a vector of tokens.
fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec {
// Use a TokenLogger so that we combine character tokens separated
// by an ignored error.
let mut sink = TokenLogger::new(exact_errors);
for tok in js.as_array().unwrap().iter() {
match *tok {
Value::String(ref s) if &s[..] == "ParseError" => {
sink.process_token(ParseError(Borrowed("")))
}
_ => sink.process_token(json_to_token(tok)),
}
}
sink.get_tokens()
}
// Build one dynamic test named `desc` that tokenizes `input` under `opts`
// and panics if the logged tokens differ from `expect`.
fn mk_xml_test(
    desc: String,
    input: String,
    expect: Value,
    opts: XmlTokenizerOpts,
) -> TestDescAndFn {
    TestDescAndFn {
        desc: TestDesc::new(DynTestName(desc)),
        testfn: DynTestFn(Box::new(move || {
            // The expected tokens do not depend on how the input is split,
            // so build them once instead of once per split.
            let expected = json_to_tokens(&expect, opts.exact_errors);

            // Split up the input at different points to test incremental tokenization.
            for input in splits(&input, 3) {
                // Clone 'input' so we have it for the failure message.
                // Also clone opts.  If we don't, we get the wrong
                // result but the compiler doesn't catch it!
                // Possibly mozilla/rust#12223.
                let output = tokenize_xml(input.clone(), opts.clone());
                if output != expected {
                    panic!(
                        "\ninput: {:?}\ngot: {:?}\nexpected: {:?}",
                        input, output, expected
                    );
                }
            }
        })),
    }
}
fn mk_xml_tests(tests: &mut Vec, filename: &str, js: &Value) {
let input: &str = &js.find("input").get_str();
let expect = js.find("output");
let desc = format!("tok: {}: {}", filename, js.find("description").get_str());
// Some tests want to start in a state other than Data.
let state_overrides = vec![None];
// Build the tests.
for state in state_overrides.into_iter() {
for &exact_errors in [false, true].iter() {
let mut newdesc = desc.clone();
match state {
Some(s) => newdesc = format!("{} (in state {:?})", newdesc, s),
None => (),
};
if exact_errors {
newdesc = format!("{} (exact errors)", newdesc);
}
tests.push(mk_xml_test(
newdesc,
String::from(input),
expect.clone(),
XmlTokenizerOpts {
exact_errors: exact_errors,
initial_state: state,
// Not discarding a BOM is what the test suite expects; see
// https://github.com/html5lib/html5lib-tests/issues/2
discard_bom: false,
..Default::default()
},
));
}
}
}
fn tests(src_dir: &Path) -> Vec {
let mut tests = vec![];
foreach_xml5lib_test(
src_dir,
"tokenizer",
OsStr::new("test"),
|path, mut file| {
let mut s = String::new();
file.read_to_string(&mut s)
.ok()
.expect("file reading error");
let js: Value = serde_json::from_str(&s).ok().expect("json parse error");
match js["tests"] {
Value::Array(ref lst) => {
for test in lst.iter() {
mk_xml_tests(
&mut tests,
path.file_name().unwrap().to_str().unwrap(),
test,
);
}
}
_ => (),
}
},
);
tests
}
// Entry point of the custom test harness: gather the tokenizer tests rooted
// at this crate's manifest directory and hand them, together with the
// command-line arguments, to the `rustc_test` runner.
fn main() {
    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    let cli_args: Vec<String> = env::args().collect();
    rustc_test::test_main(&cli_args, tests(manifest_dir));
}