use lazy_static::lazy_static;
use nipper::Document;
use nipper::Selection;
use regex::Regex;
use std::cmp::min;
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::Read;
use std::ops::Deref;
use std::time::Instant;
// Regular expressions shared by the extraction code. Each one is compiled the
// first time it is dereferenced; `unwrap` panics at that point if a pattern is
// not valid regex syntax. Every pattern starts with `(?is)` (case-insensitive,
// `.` matches newlines).
lazy_static! {
    // NOTE(review): this pattern contains a raw line break and a `]` with no
    // opening `[`; part of the literal appears to be missing. Kept as found --
    // confirm against the upstream source before relying on it.
    static ref RE_REPLACE_BRS: Regex = Regex::new(r#"(?is)(
]*>[ \n\r\t]*){2,}"#).unwrap();
    // One of | - \ / > » with a space on each side.
    static ref RE_TITLE_SEPARATOR: Regex = Regex::new(r#"(?is) [\|\-\\/>»] "#).unwrap();
    // One of \ / > » anywhere in the text.
    static ref RE_TITLE_HIERARCHY_SEP: Regex = Regex::new(r#"(?is)[\\/>»]"#).unwrap();
    // Substrings such as "byline" or "author".
    static ref RE_BY_LINE: Regex = Regex::new(r#"(?is)byline|author|dateline|writtenby|p-author"#).unwrap();
    // Substrings such as "sidebar", "comment" or "footer".
    static ref RE_UNLIKELY_CANDIDATES: Regex = Regex::new(r#"(?is)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|subscribe"#).unwrap();
    // The raw string previously began with a stray backtick, which made the first
    // alternative match only a literal "`and" instead of "and".
    static ref RE_OK_MAYBE_CANDIDATE: Regex = Regex::new(r#"(?is)and|article|body|column|main|shadow"#).unwrap();
    static ref RE_UNLIKELY_ELEMENTS: Regex = Regex::new(r#"(?is)(input|time|button|svg)"#).unwrap();
    static ref RE_LIKELY_ELEMENTS: Regex = Regex::new(r#"(?is)(no-svg)"#).unwrap();
    // Used by `get_class_or_id_weight!`: a match on class or id adds to the weight.
    static ref RE_POSITIVE: Regex = Regex::new(r#"(?is)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story|paragraph"#).unwrap();
    // Used by `get_class_or_id_weight!`: a match on class or id subtracts from the weight.
    static ref RE_NEGATIVE: Regex = Regex::new(r#"(?is)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget"#).unwrap();
    // Used by `has_child_block_element!`: the start of an opening tag for a
    // block-level (or otherwise non-inline) element.
    static ref RE_DIV_TO_P_ELEMENTS: Regex = Regex::new(r#"(?is)<(a|blockquote|dl|div|img|ol|p|pre|table|ul|select)"#).unwrap();
    // Protocol-relative or absolute URLs of a few video hosts.
    static ref RE_VIDEOS: Regex = Regex::new(r#"(?is)//(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com"#).unwrap();
    // A period followed by a space or by the end of the text.
    static ref RE_P_IS_SENTENCE: Regex = Regex::new(r#"(?is)\.( |$)"#).unwrap();
    // NOTE(review): only the flag group is left, i.e. an empty pattern, which
    // matches the empty string at every position. The real pattern appears to
    // be missing -- confirm against the upstream source.
    static ref RE_COMMENTS: Regex = Regex::new(r#"(?is)"#).unwrap();
    // NOTE(review): contains a raw line break where a tag pattern would be
    // expected; the literal looks truncated. Kept as found.
    static ref RE_KILL_BREAKS: Regex = Regex::new(r#"(?is)(
(\s| ?)*)+"#).unwrap();
    // Runs of two or more whitespace characters, or one or more newlines.
    static ref RE_SPACES: Regex = Regex::new(r#"(?is)\s{2,}|\n+"#).unwrap();
}
/// Name of the marker attribute that `mark_data_tables` sets to `"1"` on
/// tables it classifies as data tables.
const DATA_TABLE_ATTR: &str = "XXX-DATA-TABLE";
/// Evaluates to `true` when `$text.len()` is greater than 0 and less than 100.
///
/// `$text` may be any expression whose value has a `len()` method; for string
/// types that length is measured in bytes. The argument is evaluated exactly
/// once, so passing an expression with side effects is safe.
macro_rules! is_valid_by_line {
    ($text: expr) => {{
        let len = $text.len();
        len > 0 && len < 100
    }};
}
/// Evaluates to `true` when the text returned by `$sel.text()` is empty or
/// consists only of whitespace.
macro_rules! is_element_without_content {
    ($sel: expr) => {{
        $sel.text().trim().is_empty()
    }};
}
/// Evaluates to `true` when `$sel.children()` reports a length of exactly one
/// and that child set answers `true` to `is("p")`.
macro_rules! has_single_p_inside_element {
    ($sel: expr) => {{
        let kids = $sel.children();
        match kids.length() {
            // Only query the selector when there is exactly one child.
            1 => kids.is("p"),
            _ => false,
        }
    }};
}
/// Evaluates to `true` when the markup returned by `$sel.html()` matches
/// `RE_DIV_TO_P_ELEMENTS`.
macro_rules! has_child_block_element {
    ($sel: expr) => {{
        RE_DIV_TO_P_ELEMENTS.is_match(&$sel.html())
    }};
}
/// Collects up to `$depth` ancestors of `$sel`, nearest first.
///
/// Starts at `$sel.parent()` and keeps calling `parent()` on the result. The
/// walk stops early at the first ancestor whose `length()` is 0; that empty
/// value is not included. Evaluates to a `Vec` of the ancestor values.
macro_rules! get_node_ancestors {
    ($sel:expr, $depth: expr) => {{
        let mut ancestors = Vec::new();
        let mut current = $sel.parent();
        for _ in 0..$depth {
            if current.length() == 0 {
                break;
            }
            // Look up the next level first so `current` can be moved into the
            // result instead of being cloned.
            let next = current.parent();
            ancestors.push(current);
            current = next;
        }
        ancestors
    }};
}
/// Replaces `$sel` with the markup returned by `$sel.html()` wrapped in a
/// `<$tag>` ... `</$tag>` pair.
///
/// `$sel` must be mutable because `replace_with_html` is called on it. `$tag`
/// is any `Display` value and is evaluated once.
macro_rules! set_node_tag {
    ($sel: expr, $tag: expr) => {{
        let html = $sel.html();
        // `{0}` is reused for the closing tag; the earlier format string emitted
        // a second opening tag there (no `/`), leaving the element unclosed.
        let new_html = format!("<{0}>{1}</{0}>", $tag, html);
        $sel.replace_with_html(new_html.as_str());
    }};
}
/// Scores `$sel` from its `class` and `id` attributes.
///
/// Each attribute that is present is lower-cased and checked independently:
/// a `RE_NEGATIVE` match subtracts 45 and a `RE_POSITIVE` match adds 45. An
/// attribute can trigger both. Evaluates to the resulting float.
macro_rules! get_class_or_id_weight {
    ($sel: expr) => {{
        let step = 45.0;
        let mut total = 0.0;
        for attr_name in &["class", "id"] {
            if let Some(raw) = $sel.attr(*attr_name) {
                let lowered = raw.to_lowercase();
                if RE_NEGATIVE.is_match(&lowered) {
                    total -= step;
                }
                if RE_POSITIVE.is_match(&lowered) {
                    total += step;
                }
            }
        }
        total
    }};
}
#[derive(Debug)]
struct MetaData {
title: Option
");
html = &r;
body.set_html(html);
body.select("p").iter().for_each(|mut p| {
let html: &str = &p.html();
if html.trim() == "" {
p.remove();
}
});
}
/// Prepares `doc` in place before further processing.
///
/// Calls `replace_brs` on the document, then replaces every `font` element
/// with markup rebuilt from that element's own `html()` output.
fn prep_document(doc: &Document) {
replace_brs(&doc);
doc.select("font").iter().for_each(|mut font| {
let html: &str = &font.html();
// NOTE(review): the initial value here and the argument of the trailing
// `push_str` are both empty strings, so `new_html` ends up equal to `html`.
// They look like a pair of wrapper-tag literals whose contents were lost --
// confirm against the upstream source.
let mut new_html = "".to_string();
new_html.push_str(html);
new_html.push_str("");
font.replace_with_html(new_html.as_str());
})
}
/// Builds the article's `MetaData` from the `meta` elements of `doc`.
///
/// Elements whose `content` attribute is missing or empty are ignored. For
/// the rest, `content` is stored in:
/// - `author` when `name` or `property` contains "author";
/// - `cover` for `property="og:image"` or `name="twitter:image"`;
/// - `description` for `name="description"`, `property="og:description"` or
///   `name="twitter:description"`;
/// - `title` for `property="og:title"` or `name="twitter:title"`.
///
/// A later matching element overwrites an earlier one. If no title was found
/// this way, the result of `get_article_title(doc)` is used instead.
fn get_article_metadata(doc: &Document) -> MetaData {
    let mut metadata = MetaData::default();

    for meta in doc.select("meta").iter() {
        let name_attr = meta.attr_or("name", "");
        let property_attr = meta.attr_or("property", "");
        let content_attr = meta.attr_or("content", "");
        let name: &str = &name_attr;
        let property: &str = &property_attr;
        let content: &str = &content_attr;

        if content.is_empty() {
            continue;
        }
        let value = || Some(content.to_string());

        let is_author = name.contains("author") || property.contains("author");
        let is_cover = property == "og:image" || name == "twitter:image";
        let is_description = name == "description"
            || property == "og:description"
            || name == "twitter:description";
        let is_title = property == "og:title" || name == "twitter:title";

        if is_author {
            metadata.author = value();
        }
        if is_cover {
            metadata.cover = value();
        }
        if is_description {
            metadata.description = value();
        }
        if is_title {
            metadata.title = value();
        }
    }

    // Fall back to the document-derived title only when no meta tag supplied one.
    metadata.title = metadata.title.take().or_else(|| get_article_title(doc));
    metadata
}
fn get_article_title(doc: &Document) -> Option
");
let html = RE_SPACES.replace_all(&html, "");
html.to_string()
}
/// Cleans up the extracted article `content` in place.
///
/// `title` is the article title; it is only used to detect a lone `h2` that
/// repeats it. The steps run in a fixed order: data tables are marked first
/// (via `mark_data_tables`), then attributes, form controls, embeds, headers
/// and other unwanted elements are removed through the `remove_*` helpers,
/// and finally empty paragraphs and `br` elements directly followed by a `p`
/// are dropped.
fn pre_article(content: &Selection, title: &str) {
    mark_data_tables(&content);
    remove_attrs(&content);
    remove_conditionally(&content, "form");
    remove_conditionally(&content, "fieldset");
    remove_tag(&content, "h1");
    remove_tag(&content, "object");
    remove_tag(&content, "embed");
    remove_tag(&content, "footer");
    remove_tag(&content, "link");

    // Drop every element whose id or class contains "share".
    content.select("*").iter().for_each(|mut s| {
        let id = s.attr_or("id", "");
        let class = s.attr_or("class", "");
        let match_str = format!("{} {}", id, class);
        if match_str.contains("share") {
            s.remove();
        }
    });

    // If there is exactly one h2 and it roughly repeats the title, remove it.
    let mut h2s = content.select("h2");
    if h2s.length() == 1 {
        let text = h2s.text();
        // Relative difference of the byte lengths. An empty `title` makes this
        // infinite or NaN, which fails the threshold test below, so the h2 is
        // kept in that case.
        let length_similar_rate = text.len() as f64 / title.len() as f64 - 1.0;
        if length_similar_rate.abs() < 0.5 {
            // The longer of the two strings must contain the shorter one.
            let title_matches = if length_similar_rate > 0.0 {
                text.contains(title)
            } else {
                title.contains(text.deref())
            };
            if title_matches {
                h2s.remove()
            }
        }
    }

    remove_tag(&content, "iframe");
    remove_tag(&content, "input");
    remove_tag(&content, "textarea");
    remove_tag(&content, "select");
    remove_tag(&content, "button");
    remove_headers(&content);
    remove_conditionally(&content, "table");
    remove_conditionally(&content, "ul");
    // NOTE(review): conditional clean-up of `div` was already disabled here;
    // the reason is not visible in this file.
    // remove_conditionally(&content, "div");

    // Remove paragraphs that hold neither text nor any media element.
    content.select("p").iter().for_each(|mut p| {
        let img = p.select("img").length();
        let embed = p.select("embed").length();
        let object = p.select("object").length();
        let iframe = p.select("iframe").length();
        let total = img + embed + object + iframe;
        let p_text = p.text();
        if total == 0 && p_text.is_empty() {
            p.remove()
        }
    });

    // Remove a `br` whose next sibling is a paragraph.
    content.select("br").iter().for_each(|mut br| {
        if br.next_sibling().is("p") {
            br.remove()
        }
    })
}
/// Tags every `table` under `s` that looks like a data table by setting its
/// `DATA_TABLE_ATTR` attribute to `"1"`. Other tables are left untouched.
fn mark_data_tables(s: &Selection) {
    /// Decides whether one table should be marked. The checks run in this
    /// order and the first one that applies wins.
    fn is_data_table(table: &Selection) -> bool {
        const DATA_TABLE_DESCENDANTS: [&str; 5] = ["col", "colgroup", "tfoot", "thead", "th"];

        // Explicit opt-outs.
        if table.attr_or("role", "").deref() == "presentation" {
            return false;
        }
        if table.attr_or("datatable", "").deref() == "0" {
            return false;
        }

        // Structural hints that mark the table immediately.
        if table.attr("summary").is_some() {
            return true;
        }
        let caption = table.select("caption");
        if caption.length() > 0 && caption.children().length() > 0 {
            return true;
        }
        if DATA_TABLE_DESCENDANTS
            .iter()
            .any(|tag| table.select(tag).length() > 0)
        {
            return true;
        }

        // A table that contains another table is never marked by size alone.
        if table.select("table").length() > 0 {
            return false;
        }

        // Otherwise decide by size.
        let (rows, columns) = get_table_row_and_column_count(table);
        rows > 10 || columns > 4 || rows * columns > 10
    }

    for mut table in s.select("table").iter() {
        if is_data_table(&table) {
            table.set_attr(DATA_TABLE_ATTR, "1");
        }
    }
}
fn get_table_row_and_column_count(table: &Selection) -> (usize, usize) {
let mut rows = 0;
let mut columns = 0;
table.select("tr").iter().for_each(|tr| {
let str_row_span = tr.attr_or("rowspan", "1");
let row_span = str_row_span.parse::