use std::io::Read;
use std::{
    env, fs,
    time::{Duration, SystemTime, UNIX_EPOCH},
};

use anyhow::Context;
use html5ever::{parse_document, tendril::TendrilSink};
use markup5ever_rcdom::RcDom;

use dawnsearch::extract::{extract, extract_text, find_links};
use postgres::{Client, NoTls};
use url::{ParseError, Url};

fn main() -> anyhow::Result<()> {
    let args: Vec<String> = env::args().collect();
    let restart = args.contains(&"--restart".to_owned());

    fs::create_dir_all("store")?;

    let mut client = Client::connect(
        "host=localhost dbname=dawnsearch user=dawnsearch password=dawnsearch",
        NoTls,
    )?;

    // Create DB structure
    client.execute(
        "CREATE TABLE IF NOT EXISTS page (
            host TEXT NOT NULL,
            path TEXT NOT NULL,
            discovered BIGINT NOT NULL,
            crawled BIGINT
        )",
        &[],
    )?;
    client.execute(
        "CREATE INDEX IF NOT EXISTS url_find_to_crawl ON page(crawled, discovered)",
        &[],
    )?;
    client.execute(
        "CREATE UNIQUE INDEX IF NOT EXISTS find_exists ON page(host, path)",
        &[],
    )?;
    client.execute(
        "CREATE TABLE IF NOT EXISTS host (
            url TEXT NOT NULL,
            pages INTEGER DEFAULT 0 NOT NULL
        )",
        &[],
    )?;
    client.execute(
        "CREATE UNIQUE INDEX IF NOT EXISTS host_unique ON host(url)",
        &[],
    )?;
    client.execute(
        "CREATE INDEX IF NOT EXISTS find_host ON host(pages)",
        &[],
    )?;

    if restart {
        client.execute("UPDATE page SET crawled = NULL", &[])?;
        client.execute("UPDATE host SET pages = 0", &[])?;
    }

    // Register a host the first time we see it.
    fn add_host(conn: &mut Client, host: &str) -> anyhow::Result<()> {
        conn.execute(
            "INSERT INTO host (url, pages) VALUES ($1, 0) ON CONFLICT(url) DO NOTHING",
            &[&host],
        )?;
        Ok(())
    }

    // Count one more crawled page for this host.
    fn request_for_host(conn: &mut Client, host: &str) -> anyhow::Result<()> {
        conn.execute(
            "INSERT INTO host (url, pages) VALUES ($1, 1)
             ON CONFLICT(url) DO UPDATE SET pages = host.pages + 1",
            &[&host],
        )
        .context("request_for_host")?;
        Ok(())
    }

    fn timestamp() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs()
    }

    // Record a newly discovered URL in the crawl queue.
    fn add_link(conn: &mut Client, url: &str) -> anyhow::Result<()> {
        if let Ok(parsed) = Url::parse(url) {
            if parsed.port().is_some() {
                println!("Ignoring URL with port: {}", url);
                return Ok(());
            }
            // Only scheme, host and path are stored; query strings and fragments are dropped.
            let host = format!("{}://{}", parsed.scheme(), parsed.host().expect("host"));
            let path = parsed.path();
            let now = timestamp() as i64;
            let result = conn.execute(
                "INSERT INTO page (host, path, discovered) VALUES ($1, $2, $3)
                 ON CONFLICT DO NOTHING",
                &[&host, &path, &now],
            )?;
            if result > 0 {
                // println!("Discovered {}", url);
                add_host(conn, &host)?;
            }
        } else {
            println!("Ignoring invalid URL: {}", url);
        }
        Ok(())
    }

    // Resolve a possibly relative link against the page it was found on.
    fn join_if_needed(base: &Url, input: &str) -> anyhow::Result<Url> {
        match Url::parse(input) {
            Ok(url) => Ok(url),
            Err(ParseError::RelativeUrlWithoutBase) => Ok(base.join(input)?),
            error => Ok(error?),
        }
    }

    // Normalize a link to scheme://host[:port]/path form.
    fn clean_url(base: &Url, input: &str) -> anyhow::Result<String> {
        let url = join_if_needed(base, input)?;
        let scheme = url.scheme();
        let host = url.host().context("no host")?.to_string();
        let path = url.path();
        Ok(if let Some(port) = url.port() {
            format!("{}://{}:{}{}", scheme, host, port, path)
        } else {
            format!("{}://{}{}", scheme, host, path)
        })
    }

    // Seed the crawl queue with the URLs given on the command line.
    for url in env::args().skip(1) {
        if url.starts_with("--") {
            continue;
        }
        println!("Adding {} to the list of URLs to crawl", url);
        add_link(&mut client, &url)?;
    }

    // Let's go crawl!
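    // The loop below repeats the following steps until the queue is empty:
    // pick a not-yet-crawled page from the host with the fewest crawled pages
    // (so no single host dominates), mark it as crawled, fetch at most 250 KiB
    // of the body with a 2-second timeout, queue every link found in the
    // document, and extract the readable text with dawnsearch's extract helpers.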
    fn agent() -> ureq::Agent {
        ureq::AgentBuilder::new()
            .user_agent("Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; DawnSearch/0.1; https://localhost/todo/fill/this/in) Chrome/115.0.0.0 Safari/537.36")
            .max_idle_connections(0)
            .timeout(Duration::from_secs(2))
            .build()
    }

    // Prepared statements for the crawl queue.
    let find_to_crawl = client.prepare(
        "SELECT page.host, path, discovered FROM page
         INNER JOIN host ON page.host = host.url
         WHERE crawled IS NULL
         ORDER BY host.pages ASC
         LIMIT 1",
    )?;
    let mark_crawled =
        client.prepare("UPDATE page SET crawled = $3 WHERE host = $1 AND path = $2")?;

    let mut pages_crawled = 0;
    loop {
        let rows = client.query(&find_to_crawl, &[])?;
        let mut found_some = false;
        for row in rows {
            let host: String = row.get(0);
            let path: String = row.get(1);
            let discovered: i64 = row.get(2);
            found_some = true;

            let now = timestamp() as i64;
            client.execute(&mark_crawled, &[&host, &path, &now])?;

            let url = format!("{}{}", host, path);

            // Let's crawl.
            request_for_host(&mut client, &host)?;
            let response = match agent().get(&url).call() {
                Ok(r) => r,
                Err(e) => {
                    println!("Failed to load {}: {}", url, e);
                    continue;
                }
            };
            // Cap the body at 250 KiB so a single page can't blow up memory.
            let mut body = response.into_reader().take(1024 * 250);
            let url = Url::parse(&url).unwrap();

            // TODO: main page of wikipedia does not extract correctly. Firefox reader works.
            let mut dom = match parse_document(RcDom::default(), Default::default())
                .from_utf8()
                .read_from(&mut body)
            {
                Ok(dom) => dom,
                Err(e) => {
                    println!("Failed to read {}: {}", url, e);
                    continue;
                }
            };

            // Queue every link found in the document.
            let mut links = Vec::new();
            find_links(&dom.document, &mut links);
            for link in links {
                if let Ok(link_url) = clean_url(&url, &link.href) {
                    add_link(&mut client, &link_url)?;
                }
            }

            // Extract the readable text from the page.
            let (cleaned_document, _title) = extract(&mut dom, &url);
            let mut clean: String = String::new();
            extract_text(&cleaned_document, &mut clean, true);

            pages_crawled += 1;
            println!("{} {} {}", pages_crawled, url, discovered);
            // println!("{}", clean);
            // std::thread::sleep(Duration::from_secs(2));
        }
        if !found_some {
            break;
        }
    }
    Ok(())
}
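// Example invocation (a sketch; the seed URL is only an illustration, and a
// local Postgres database matching the credentials in Client::connect above
// is assumed to be running). Any absolute http(s) URL works as a seed, and
// --restart re-queues everything that was already crawled:
//
//     cargo run --release -- https://example.com/
//     cargo run --release -- --restart https://example.com/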