Crates.io | frangipani |
lib.rs | frangipani |
version | 0.3.1 |
source | src |
created_at | 2022-08-20 07:21:16.340402 |
updated_at | 2023-04-03 09:18:55.636108 |
description | Scraping framework for rust |
homepage | https://github.com/aprimadi/frangipani |
repository | https://github.com/aprimadi/frangipani |
max_upload_size | |
id | 649075 |
size | 110,809 |
The goal of this project is to create a configurable and extensible crawler framework.
use async_trait::async_trait;
use frangipani::{Response, Spider};
use frangipani::util::join_url;
use scraper::{Html, Selector};
/// Example spider that crawls pages under the `dexcode.com` domain.
///
/// The type carries no state; all behaviour lives in its `Spider` impl.
#[derive(Debug, Clone, Default)]
pub struct DexcodeSpider {}
#[async_trait]
impl Spider for DexcodeSpider {
    /// Returns the fixed identifier of this spider.
    fn name(&self) -> String {
        "dexcode-spider".to_owned()
    }

    /// Returns the seed URLs the crawl starts from.
    fn start_urls(&self) -> Vec<String> {
        vec!["https://dexcode.com/".to_owned()]
    }

    /// Extracts follow-up links from an HTML response and prints `url,title`.
    ///
    /// Returns `(0, vec![])` when the response content type is not exactly
    /// `text/html`; otherwise returns `(1, urls)` where `urls` holds every
    /// `http`/`https` link found in an `<a href>` whose host is `dexcode.com`
    /// or one of its subdomains.
    ///
    /// Links that cannot be parsed as a URL, or whose host is not a domain
    /// name (for example an IP address), are skipped instead of panicking.
    ///
    /// # Panics
    ///
    /// Panics if `Response::into_string` fails.
    async fn parse(&self, response: Response) -> (u64, Vec<String>) {
        if response.content_type() != "text/html" {
            return (0, vec![]);
        }

        let url = response.get_url().to_owned();
        // NOTE(review): this still panics when the body cannot be turned into
        // a string; the return type of `into_string` is not visible here, so
        // the failure path is left unchanged — consider handling it.
        let text = response.into_string().unwrap();

        let mut urls = vec![];
        {
            let document = Html::parse_document(&text);

            // "a" is a constant, valid CSS selector, so this cannot fail.
            let link_selector = Selector::parse("a").unwrap();
            for link in document.select(&link_selector) {
                let relative_url = match link.value().attr("href") {
                    Some(href) => href,
                    None => continue,
                };

                let joined = join_url(&url, relative_url);
                // A single malformed href must not abort the whole crawl.
                let req_url = match reqwest::Url::parse(&joined) {
                    Ok(parsed) => parsed,
                    Err(_) => continue,
                };

                if req_url.scheme() != "http" && req_url.scheme() != "https" {
                    continue;
                }

                // `domain()` is `None` for IP-address hosts; treat those as
                // off-site rather than unwrapping.
                if req_url.domain().map_or(false, is_dexcode_domain) {
                    // Only push url with `dexcode.com` domain
                    urls.push(req_url.to_string());
                }
            }

            // "title" is likewise a constant, valid selector.
            let title_selector = Selector::parse("title").unwrap();
            let title = document
                .select(&title_selector)
                .next()
                .map(|el| el.inner_html())
                .unwrap_or_default();
            println!("{},{}", url, title);
        }

        (1, urls)
    }
}

/// Returns `true` when `domain` is `dexcode.com` itself or a subdomain of it.
///
/// A plain suffix test would also accept unrelated hosts such as
/// `notdexcode.com`, so the match is anchored on a label boundary.
fn is_dexcode_domain(domain: &str) -> bool {
    domain == "dexcode.com" || domain.ends_with(".dexcode.com")
}
/// Entry point: initialises logging, registers the single `DexcodeSpider`
/// with the frangipani engine, and awaits the engine's `start` future.
#[tokio::main]
async fn main() {
    env_logger::init();

    let dexcode: Box<dyn Spider + Send + Sync> = Box::new(DexcodeSpider {});
    let mut crawler = frangipani::engine(vec![dexcode]);
    crawler.start().await;
}
For continuous crawling, see examples/continuous.rs
in the project repository.