// Copyright 2014-2017 The html5ever Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. extern crate html5ever; extern crate typed_arena; use html5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use html5ever::tendril::{StrTendril, TendrilSink}; use html5ever::{parse_document, Attribute, QualName}; use std::borrow::Cow; use std::cell::{Cell, RefCell}; use std::collections::HashSet; use std::io::{self, Read}; use std::ptr; /// By using our Sink type, the arena is filled with parsed HTML. fn html5ever_parse_slice_into_arena<'a>(bytes: &[u8], arena: Arena<'a>) -> Ref<'a> { let sink = Sink { arena, document: arena.alloc(Node::new(NodeData::Document)), quirks_mode: Cell::new(QuirksMode::NoQuirks), }; parse_document(sink, Default::default()) .from_utf8() .one(bytes) } type Arena<'arena> = &'arena typed_arena::Arena>; type Ref<'arena> = &'arena Node<'arena>; type Link<'arena> = Cell>>; /// Sink struct is responsible for handling how the data that comes out of the HTML parsing /// unit (TreeBuilder in our case) is handled. struct Sink<'arena> { arena: Arena<'arena>, document: Ref<'arena>, quirks_mode: Cell, } /// DOM node which contains links to other nodes in the tree. pub struct Node<'arena> { parent: Link<'arena>, next_sibling: Link<'arena>, previous_sibling: Link<'arena>, first_child: Link<'arena>, last_child: Link<'arena>, data: NodeData<'arena>, } /// HTML node data which can be an element, a comment, a string, a DOCTYPE, etc... pub enum NodeData<'arena> { Document, Doctype { name: StrTendril, public_id: StrTendril, system_id: StrTendril, }, Text { contents: RefCell, }, Comment { contents: StrTendril, }, Element { name: QualName, attrs: RefCell>, template_contents: Option>, mathml_annotation_xml_integration_point: bool, }, ProcessingInstruction { target: StrTendril, contents: StrTendril, }, } impl<'arena> Node<'arena> { fn new(data: NodeData<'arena>) -> Self { Node { parent: Cell::new(None), previous_sibling: Cell::new(None), next_sibling: Cell::new(None), first_child: Cell::new(None), last_child: Cell::new(None), data, } } fn detach(&self) { let parent = self.parent.take(); let previous_sibling = self.previous_sibling.take(); let next_sibling = self.next_sibling.take(); if let Some(next_sibling) = next_sibling { next_sibling.previous_sibling.set(previous_sibling); } else if let Some(parent) = parent { parent.last_child.set(previous_sibling); } if let Some(previous_sibling) = previous_sibling { previous_sibling.next_sibling.set(next_sibling); } else if let Some(parent) = parent { parent.first_child.set(next_sibling); } } fn append(&'arena self, new_child: &'arena Self) { new_child.detach(); new_child.parent.set(Some(self)); if let Some(last_child) = self.last_child.take() { new_child.previous_sibling.set(Some(last_child)); debug_assert!(last_child.next_sibling.get().is_none()); last_child.next_sibling.set(Some(new_child)); } else { debug_assert!(self.first_child.get().is_none()); self.first_child.set(Some(new_child)); } self.last_child.set(Some(new_child)); } fn insert_before(&'arena self, new_sibling: &'arena Self) { new_sibling.detach(); new_sibling.parent.set(self.parent.get()); new_sibling.next_sibling.set(Some(self)); if let Some(previous_sibling) = self.previous_sibling.take() { new_sibling.previous_sibling.set(Some(previous_sibling)); debug_assert!(ptr::eq::( previous_sibling.next_sibling.get().unwrap(), self )); previous_sibling.next_sibling.set(Some(new_sibling)); } else if let Some(parent) = self.parent.get() { debug_assert!(ptr::eq::(parent.first_child.get().unwrap(), self)); parent.first_child.set(Some(new_sibling)); } self.previous_sibling.set(Some(new_sibling)); } } impl<'arena> Sink<'arena> { fn new_node(&self, data: NodeData<'arena>) -> Ref<'arena> { self.arena.alloc(Node::new(data)) } fn append_common(&self, child: NodeOrText>, previous: P, append: A) where P: FnOnce() -> Option>, A: FnOnce(Ref<'arena>), { let new_node = match child { NodeOrText::AppendText(text) => { // Append to an existing Text node if we have one. if let Some(&Node { data: NodeData::Text { ref contents }, .. }) = previous() { contents.borrow_mut().push_tendril(&text); return; } self.new_node(NodeData::Text { contents: RefCell::new(text), }) }, NodeOrText::AppendNode(node) => node, }; append(new_node) } } /// By implementing the TreeSink trait we determine how the data from the tree building step /// is processed. In our case, our data is allocated in the arena and added to the Node data /// structure. /// /// For deeper understating of each function go to the TreeSink declaration. impl<'arena> TreeSink for Sink<'arena> { type Handle = Ref<'arena>; type Output = Ref<'arena>; type ElemName<'a> = &'a QualName where Self : 'a; fn finish(self) -> Ref<'arena> { self.document } fn parse_error(&self, _: Cow<'static, str>) {} fn get_document(&self) -> Ref<'arena> { self.document } fn set_quirks_mode(&self, mode: QuirksMode) { self.quirks_mode.set(mode); } fn same_node(&self, x: &Ref<'arena>, y: &Ref<'arena>) -> bool { ptr::eq::(*x, *y) } fn elem_name(&self, target: &Ref<'arena>) -> Self::ElemName<'_> { match target.data { NodeData::Element { ref name, .. } => name, _ => panic!("not an element!"), } } fn get_template_contents(&self, target: &Ref<'arena>) -> Ref<'arena> { if let NodeData::Element { template_contents: Some(contents), .. } = target.data { contents } else { panic!("not a template element!") } } fn is_mathml_annotation_xml_integration_point(&self, target: &Ref<'arena>) -> bool { if let NodeData::Element { mathml_annotation_xml_integration_point, .. } = target.data { mathml_annotation_xml_integration_point } else { panic!("not an element!") } } fn create_element( &self, name: QualName, attrs: Vec, flags: ElementFlags, ) -> Ref<'arena> { self.new_node(NodeData::Element { name, attrs: RefCell::new(attrs), template_contents: if flags.template { Some(self.new_node(NodeData::Document)) } else { None }, mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point, }) } fn create_comment(&self, text: StrTendril) -> Ref<'arena> { self.new_node(NodeData::Comment { contents: text }) } fn create_pi(&self, target: StrTendril, data: StrTendril) -> Ref<'arena> { self.new_node(NodeData::ProcessingInstruction { target, contents: data, }) } fn append(&self, parent: &Ref<'arena>, child: NodeOrText>) { self.append_common( child, || parent.last_child.get(), |new_node| parent.append(new_node), ) } fn append_before_sibling(&self, sibling: &Ref<'arena>, child: NodeOrText>) { self.append_common( child, || sibling.previous_sibling.get(), |new_node| sibling.insert_before(new_node), ) } fn append_based_on_parent_node( &self, element: &Ref<'arena>, prev_element: &Ref<'arena>, child: NodeOrText>, ) { if element.parent.get().is_some() { self.append_before_sibling(element, child) } else { self.append(prev_element, child) } } fn append_doctype_to_document( &self, name: StrTendril, public_id: StrTendril, system_id: StrTendril, ) { self.document.append(self.new_node(NodeData::Doctype { name, public_id, system_id, })) } fn add_attrs_if_missing(&self, target: &Ref<'arena>, attrs: Vec) { let mut existing = if let NodeData::Element { ref attrs, .. } = target.data { attrs.borrow_mut() } else { panic!("not an element") }; let existing_names = existing .iter() .map(|e| e.name.clone()) .collect::>(); existing.extend( attrs .into_iter() .filter(|attr| !existing_names.contains(&attr.name)), ); } fn remove_from_parent(&self, target: &Ref<'arena>) { target.detach() } fn reparent_children(&self, node: &Ref<'arena>, new_parent: &Ref<'arena>) { let mut next_child = node.first_child.get(); while let Some(child) = next_child { debug_assert!(ptr::eq::(child.parent.get().unwrap(), *node)); next_child = child.next_sibling.get(); new_parent.append(child) } } } /// In this example an "arena" is created and filled with the DOM nodes. /// "Arena" is a type of allocation in which a block of memory is allocated /// and later filled with data, DOM nodes in this case. When the arena is deallocated /// it is destroyed with all of its items. /// /// Further info about arena: https://docs.rs/typed-arena/latest/typed_arena/ fn main() { // Read HTML from the standard input let mut bytes = Vec::new(); io::stdin().read_to_end(&mut bytes).unwrap(); let arena = typed_arena::Arena::new(); html5ever_parse_slice_into_arena(&bytes, &arena); }