//! Render Markdown files to HTML. use ammonia::{Builder, UrlRelative, UrlRelativeEvaluate}; use comrak::nodes::{AstNode, NodeValue}; use htmlescape::encode_minimal; use std::borrow::Cow; use std::path::Path; use url::Url; /// Context for markdown to HTML rendering. struct MarkdownRenderer<'a> { html_sanitizer: Builder<'a>, } impl<'a> MarkdownRenderer<'a> { /// Creates a new renderer instance. /// /// Per `text_to_html`, `base_url` is the base URL prepended to any /// relative links in the input document. See that function for more detail. fn new(base_url: Option<&'a str>, base_dir: &'a str) -> MarkdownRenderer<'a> { let allowed_classes = hashmap(&[( "code", hashset(&[ "language-bash", "language-clike", "language-glsl", "language-go", "language-ini", "language-javascript", "language-json", "language-markup", "language-protobuf", "language-ruby", "language-rust", "language-scss", "language-sql", "language-toml", "language-yaml", ]), )]); let sanitize_url = UrlRelative::Custom(Box::new(SanitizeUrl::new(base_url, base_dir))); let mut html_sanitizer = Builder::default(); html_sanitizer .add_tags(&["input"]) .link_rel(Some("nofollow noopener noreferrer")) .add_generic_attributes(&["align"]) .add_tag_attributes("a", &["id", "target"]) .add_tag_attributes("input", &["checked", "disabled", "type"]) .allowed_classes(allowed_classes) .url_relative(sanitize_url) .id_prefix(Some("user-content-")); MarkdownRenderer { html_sanitizer } } /// Renders the given markdown to HTML using the current settings. fn to_html(&self, text: &str) -> String { use comrak::{ format_html, parse_document, Arena, ComrakExtensionOptions, ComrakOptions, ComrakRenderOptions, }; let options = ComrakOptions { render: ComrakRenderOptions { unsafe_: true, // The output will be sanitized with `ammonia` ..ComrakRenderOptions::default() }, extension: ComrakExtensionOptions { autolink: true, strikethrough: true, table: true, tagfilter: true, tasklist: true, header_ids: Some("user-content-".to_string()), ..ComrakExtensionOptions::default() }, ..ComrakOptions::default() }; let arena = Arena::new(); let root = parse_document(&arena, text, &options); // Tweak annotations of code blocks. iter_nodes(root, &|node| { if let NodeValue::CodeBlock(ref mut ncb) = node.data.borrow_mut().value { // If annot includes invalid UTF-8 char, do nothing. if let Ok(mut orig_annot) = String::from_utf8(ncb.info.to_vec()) { // Ignore characters after a comma for syntax highlighting to work correctly. if let Some(offset) = orig_annot.find(',') { let _ = orig_annot.drain(offset..orig_annot.len()); ncb.info = orig_annot.as_bytes().to_vec(); } } } }); let mut html = Vec::new(); format_html(root, &options, &mut html).unwrap(); let rendered = String::from_utf8(html).unwrap(); self.html_sanitizer.clean(&rendered).to_string() } } /// Iterate the nodes in the CommonMark AST, used in comrak. fn iter_nodes<'a, F>(node: &'a AstNode<'a>, f: &F) where F: Fn(&'a AstNode<'a>), { f(node); for c in node.children() { iter_nodes(c, f); } } /// Add trailing slash and remove `.git` suffix of base URL. fn canon_base_url(mut base_url: String) -> String { if !base_url.ends_with('/') { base_url.push('/'); } if base_url.ends_with(".git/") { let offset = base_url.len() - 5; base_url.drain(offset..offset + 4); } base_url } /// Sanitize relative URLs in Markdown files. struct SanitizeUrl { base_url: Option, base_dir: String, } impl SanitizeUrl { fn new(base_url: Option<&str>, base_dir: &str) -> Self { let base_url = base_url .and_then(|base_url| Url::parse(base_url).ok()) .and_then(|url| match url.host_str() { Some("github.com") | Some("gitlab.com") | Some("bitbucket.org") => { Some(canon_base_url(url.into())) } _ => None, }); Self { base_url, base_dir: base_dir.to_owned(), } } } /// Groups media-related URL info struct MediaUrl { is_media: bool, add_sanitize_query: bool, } /// Determine whether the given URL has a media file extension. /// Also check if `sanitize=true` must be added to the query string, /// which is required to load SVGs properly from GitHub. fn is_media_url(url: &str) -> MediaUrl { Path::new(url) .extension() .and_then(std::ffi::OsStr::to_str) .map_or( MediaUrl { is_media: false, add_sanitize_query: false, }, |e| match e { "svg" => MediaUrl { is_media: true, add_sanitize_query: true, }, "png" | "jpg" | "jpeg" | "gif" | "mp4" | "webm" | "ogg" | "webp" => MediaUrl { is_media: true, add_sanitize_query: false, }, _ => MediaUrl { is_media: false, add_sanitize_query: false, }, }, ) } impl UrlRelativeEvaluate for SanitizeUrl { fn evaluate<'a>(&self, url: &'a str) -> Option> { if url.starts_with('#') { // Always allow fragment URLs. return Some(Cow::Borrowed(url)); } self.base_url.as_ref().map(|base_url| { let mut new_url = base_url.clone(); // Assumes GitHub’s URL scheme. GitHub renders text and markdown // better in the "blob" view, but images need to be served raw. let MediaUrl { is_media, add_sanitize_query, } = is_media_url(url); new_url += if is_media { "raw/HEAD" } else { "blob/HEAD" }; if !self.base_dir.is_empty() { new_url += "/"; new_url += &self.base_dir; } if !url.starts_with('/') { new_url.push('/'); } new_url += url; if add_sanitize_query { if let Ok(mut parsed_url) = Url::parse(&new_url) { parsed_url.query_pairs_mut().append_pair("sanitize", "true"); new_url = parsed_url.into(); } } Cow::Owned(new_url) }) } } /// Renders Markdown text to sanitized HTML with a given `base_url`. /// See `text_to_html` for the interpretation of `base_url`. fn markdown_to_html(text: &str, base_url: Option<&str>, base_dir: &str) -> String { let renderer = MarkdownRenderer::new(base_url, base_dir); renderer.to_html(text) } /// Any file with a filename ending in one of these extensions will be rendered as Markdown. /// Note we also render a file as Markdown if _no_ extension is on the filename. static MARKDOWN_EXTENSIONS: [&str; 7] = ["md", "markdown", "mdown", "mdwn", "mkd", "mkdn", "mkdown"]; /// Renders a text file to sanitized HTML. An appropriate rendering method is chosen depending /// on the extension of the supplied `filename`. /// /// The returned text will not contain any harmful HTML tag or attribute (such as iframe, /// onclick, onmouseover, etc.). /// /// The `base_url` parameter will be used as the base for any relative links found in the /// Markdown, as long as its host part is github.com, gitlab.com, or bitbucket.org. The /// supplied URL will be used as a directory base whether or not the relative link is /// prefixed with '/'. If `None` is passed, relative links will be omitted. /// /// # Examples /// /// ``` /// use cargo_registry_markdown::text_to_html; /// /// let text = "[Rust](https://rust-lang.org/) is an awesome *systems programming* language!"; /// let rendered = text_to_html(text, "README.md", None, None); /// assert_eq!(rendered, "

Rust is an awesome systems programming language!

\n"); /// ``` pub fn text_to_html( text: &str, readme_path_in_pkg: &str, base_url: Option<&str>, pkg_path_in_vcs: Option<&str>, ) -> String { let path_in_vcs = Path::new(pkg_path_in_vcs.unwrap_or("")).join(readme_path_in_pkg); let base_dir = path_in_vcs.parent().and_then(|p| p.to_str()).unwrap_or(""); if path_in_vcs.extension().is_none() { return markdown_to_html(text, base_url, base_dir); } if let Some(ext) = path_in_vcs.extension().and_then(|ext| ext.to_str()) { if MARKDOWN_EXTENSIONS.contains(&ext.to_lowercase().as_str()) { return markdown_to_html(text, base_url, base_dir); } } encode_minimal(text).replace("\n", "
\n") } /// Helper function to build a new `HashSet` from the items slice. fn hashset(items: &[T]) -> std::collections::HashSet where T: Clone + Eq + std::hash::Hash, { items.iter().cloned().collect() } /// Helper function to build a new `HashMap` from a slice of key-value pairs. fn hashmap(items: &[(K, V)]) -> std::collections::HashMap where K: Clone + Eq + std::hash::Hash, V: Clone, { items.iter().cloned().collect() } #[cfg(test)] mod tests { use super::*; #[test] fn empty_text() { let text = ""; let result = markdown_to_html(text, None, ""); assert_eq!(result, ""); } #[test] fn text_with_script_tag() { let text = "foo_readme\n\n"; let result = markdown_to_html(text, None, ""); assert_eq!( result, "

foo_readme

\n<script>alert(\'Hello World\')</script>\n" ); } #[test] fn text_with_iframe_tag() { let text = "foo_readme\n\n"; let result = markdown_to_html(text, None, ""); assert_eq!( result, "

foo_readme

\n<iframe>alert(\'Hello World\')</iframe>\n" ); } #[test] fn text_with_unknown_tag() { let text = "foo_readme\n\nalert('Hello World')"; let result = markdown_to_html(text, None, ""); assert_eq!(result, "

foo_readme

\n

alert(\'Hello World\')

\n"); } #[test] fn text_with_inline_javascript() { let text = r#"foo_readme\n\nCrate page"#; let result = markdown_to_html(text, None, ""); assert_eq!( result, "

foo_readme\\n\\nCrate page

\n" ); } // See https://github.com/kivikakk/comrak/issues/37. This panic happened // in comrak 0.1.8 but was fixed in 0.1.9. #[test] fn text_with_fancy_single_quotes() { let text = "wb’"; let result = markdown_to_html(text, None, ""); assert_eq!(result, "

wb’

\n"); } #[test] fn code_block_with_syntax_highlighting() { let code_block = r#"```rust \ println!("Hello World"); \ ```"#; let result = markdown_to_html(code_block, None, ""); assert!(result.contains("")); } #[test] fn code_block_with_syntax_highlighting_even_if_annot_has_no_run() { let code_block = r#"```rust , no_run \ println!("Hello World"); \ ```"#; let result = markdown_to_html(code_block, None, ""); assert!(result.contains("")); } #[test] fn text_with_forbidden_class_attribute() { let text = "

Hello World!

"; let result = markdown_to_html(text, None, ""); assert_eq!(result, "

Hello World!

\n"); } #[test] fn relative_links() { let absolute = "[hi](/hi)"; let relative = "[there](there)"; let image = "![alt](img.png)"; let html_image = "\"alt\""; let svg = "![alt](sanitize.svg)"; for host in &["github.com", "gitlab.com", "bitbucket.org"] { for (&extra_slash, &dot_git) in [true, false].iter().zip(&[true, false]) { let url = format!( "https://{}/rust-lang/test{}{}", host, if dot_git { ".git" } else { "" }, if extra_slash { "/" } else { "" }, ); let result = markdown_to_html(absolute, Some(&url), ""); assert_eq!( result, format!( "

hi

\n", host ) ); let result = markdown_to_html(relative, Some(&url), ""); assert_eq!( result, format!( "

there

\n", host ) ); let result = markdown_to_html(image, Some(&url), ""); assert_eq!( result, format!( "

\"alt\"

\n", host ) ); let result = markdown_to_html(html_image, Some(&url), ""); assert_eq!( result, format!( "\"alt\"\n", host ) ); let result = markdown_to_html(svg, Some(&url), ""); assert_eq!( result, format!( "

\"alt\"

\n", host ) ); let result = markdown_to_html(svg, Some(&url), "subdir"); assert_eq!( result, format!( "

\"alt\"

\n", host ) ); let result = markdown_to_html(svg, Some(&url), "subdir1/subdir2"); assert_eq!( result, format!( "

\"alt\"

\n", host ) ); } } let result = markdown_to_html(absolute, Some("https://google.com/"), ""); assert_eq!( result, "

hi

\n" ); } #[test] fn absolute_links_dont_get_resolved() { let text = "[![Crates.io](https://img.shields.io/crates/v/clap.svg)](https://crates.io/crates/clap)"; let repository = "https://github.com/kbknapp/clap-rs/"; let result = markdown_to_html(text, Some(repository), ""); assert_eq!( result, "

\"Crates.io\"

\n" ); } #[test] fn text_to_html_renders_markdown() { for f in &[ "README", "readme.md", "README.MARKDOWN", "whatever.mkd", "s/readme.md", "s1/s2/readme.md", ] { assert_eq!( text_to_html("*lobster*", f, None, None), "

lobster

\n" ); } assert_eq!( text_to_html("*[lobster](docs/lobster)*", "readme.md", Some("https://github.com/rust-lang/test"), None), "

lobster

\n" ); assert_eq!( text_to_html("*[lobster](docs/lobster)*", "s/readme.md", Some("https://github.com/rust-lang/test"), None), "

lobster

\n" ); assert_eq!( text_to_html("*[lobster](docs/lobster)*", "s1/s2/readme.md", Some("https://github.com/rust-lang/test"), None), "

lobster

\n" ); assert_eq!( text_to_html("*[lobster](docs/lobster)*", "s1/s2/readme.md", Some("https://github.com/rust-lang/test"), Some("path/in/vcs/")), "

lobster

\n" ); assert_eq!( text_to_html("*[lobster](docs/lobster)*", "s1/s2/readme.md", Some("https://github.com/rust-lang/test"), Some("path/in/vcs")), "

lobster

\n" ); } #[test] fn text_to_html_renders_other_things() { for f in &["readme.exe", "readem.org", "blah.adoc"] { assert_eq!( text_to_html("\n\nis my friend\n", f, None, None), "<script>lobster</script>
\n
\nis my friend
\n" ); } } #[test] fn header_has_tags() { let text = "# My crate\n\nHello, world!\n"; let result = markdown_to_html(text, None, ""); assert_eq!( result, "

My crate

\n

Hello, world!

\n" ); } #[test] fn manual_anchor_is_sanitized() { let text = "

My crate

\n

Hello, world!

\n"; let result = markdown_to_html(text, None, ""); assert_eq!( result, "

My crate

\n

Hello, world!

\n" ); } #[test] fn tables_with_rowspan_and_colspan() { let text = "
Target
\n"; let result = markdown_to_html(text, None, ""); assert_eq!( result, "
Target
\n" ); } #[test] fn text_alignment() { let text = "

foo-bar

\n
Hello World!
\n"; let result = markdown_to_html(text, None, ""); assert_eq!( result, "

foo-bar

\n
Hello World!
\n" ); } #[test] fn image_alignment() { let text = "

\"\"

\n"; let result = markdown_to_html(text, None, ""); assert_eq!( result, "

\"\"

\n" ); } }