| Field | Value |
|---|---|
| Crates.io | crusty-core |
| lib.rs | crusty-core |
| version | 0.82.0 |
| source | src |
| created_at | 2021-05-24 17:53:35.29624 |
| updated_at | 2021-11-27 09:02:46.60626 |
| description | Library for creating blazing fast and configurable web crawlers |
| homepage | |
| repository | https://github.com/let4be/crusty-core |
| max_upload_size | |
| id | 401512 |
| size | 159,576 |
Example - crawl a single website and collect information about TITLE tags:

```rust
use crusty_core::{prelude::*, select_task_expanders::FollowLinks};

#[derive(Debug, Default)]
pub struct JobState {
    sum_title_len: usize,
}

#[derive(Debug, Clone, Default)]
pub struct TaskState {
    title: String,
}

pub struct DataExtractor {}
type Ctx = JobCtx<JobState, TaskState>;

impl TaskExpander<JobState, TaskState, Document> for DataExtractor {
    fn expand(
        &self,
        ctx: &mut Ctx,
        _: &Task,
        _: &HttpStatus,
        doc: &Document,
    ) -> task_expanders::Result {
        // Grab the page <title>: add its length to the shared job state and
        // keep the text itself in this task's state.
        if let Some(title) = doc.find(Name("title")).next().map(|v| v.text()) {
            ctx.job_state.lock().unwrap().sum_title_len += title.len();
            ctx.task_state.title = title;
        }
        Ok(())
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let crawler = Crawler::new_default()?;

    let settings = config::CrawlingSettings::default();
    let rules = CrawlingRules::new(CrawlingRulesOptions::default(), document_parser())
        .with_task_expander(|| DataExtractor {})
        .with_task_expander(|| FollowLinks::new(LinkTarget::HeadFollow));

    let job = Job::new("https://example.com", settings, rules, JobState::default())?;
    for r in crawler.iter(job) {
        println!("- {}, task state: {:?}", r, r.ctx.task_state);
        if let JobStatus::Finished(_) = r.status {
            println!("final job state: {:?}", r.ctx.job_state.lock().unwrap());
        }
    }
    Ok(())
}
```
If you want to get fancier, configure things yourself, or control your imports more precisely:
```rust
use crusty_core::{
    config,
    select::predicate::Name,
    select_task_expanders::{document_parser, Document, FollowLinks},
    task_expanders,
    types::{HttpStatus, Job, JobCtx, JobStatus, LinkTarget, Task},
    Crawler, CrawlingRules, CrawlingRulesOptions, ParserProcessor, TaskExpander,
};

#[derive(Debug, Default)]
pub struct JobState {
    sum_title_len: usize,
}

#[derive(Debug, Clone, Default)]
pub struct TaskState {
    title: String,
}

pub struct DataExtractor {}
type Ctx = JobCtx<JobState, TaskState>;

impl TaskExpander<JobState, TaskState, Document> for DataExtractor {
    fn expand(
        &self,
        ctx: &mut Ctx,
        _: &Task,
        _: &HttpStatus,
        doc: &Document,
    ) -> task_expanders::Result {
        let title = doc.find(Name("title")).next().map(|v| v.text());
        if let Some(title) = title {
            ctx.job_state.lock().unwrap().sum_title_len += title.len();
            ctx.task_state.title = title;
        }
        Ok(())
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let concurrency_profile = config::ConcurrencyProfile::default();
    let parser_profile = config::ParserProfile::default();
    let tx_pp = ParserProcessor::spawn(concurrency_profile, parser_profile);

    let networking_profile = config::NetworkingProfile::default().resolve()?;
    let crawler = Crawler::new(networking_profile, tx_pp);

    let settings = config::CrawlingSettings::default();
    let rules_opt = CrawlingRulesOptions::default();
    let rules = CrawlingRules::new(rules_opt, document_parser())
        .with_task_expander(|| DataExtractor {})
        .with_task_expander(|| FollowLinks::new(LinkTarget::HeadFollow));

    let job = Job::new("https://example.com", settings, rules, JobState::default())?;
    for r in crawler.iter(job) {
        println!("- {}, task state: {:?}", r, r.ctx.task_state);
        if let JobStatus::Finished(_) = r.status {
            println!("final job state: {:?}", r.ctx.job_state.lock().unwrap());
        }
    }
    Ok(())
}
```
Simply add this to your Cargo.toml:

```toml
[dependencies]
crusty-core = { version = "~0.82.0", features = ["select_rs"] }
```
If you need just the library without the built-in select.rs task expanders (for links, images, etc.):

```toml
[dependencies]
crusty-core = "~0.82.0"
```
To develop locally:

- make sure rustup is installed: https://rustup.rs/
- make sure pre-commit is installed: https://pre-commit.com/
- make sure markdown-pp is installed: https://github.com/jreese/markdown-pp
- run ./go setup
- run ./go check to run all pre-commit hooks and ensure everything is ready to go for git
- run ./go release minor to release the next minor version on crates.io
Please see the examples for more complicated usage scenarios. This crawler is more verbose than some alternatives, but it allows extensive customization at every step of the crawl.
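For instance, additional custom expanders can be chained onto the same CrawlingRules to collect extra per-page data. The sketch below is illustrative only: the `LinkCounter` type and the `link_count`/`sum_links` fields are not part of crusty-core and assume the `TaskState`/`JobState` structs from the examples above are extended accordingly; otherwise it only reuses the `TaskExpander` trait and select.rs predicates already shown.

```rust
// Illustrative sketch: LinkCounter is a hypothetical extra expander, not part
// of crusty-core. It assumes JobState gains a `sum_links: usize` field and
// TaskState gains a `link_count: usize` field.
pub struct LinkCounter {}

impl TaskExpander<JobState, TaskState, Document> for LinkCounter {
    fn expand(
        &self,
        ctx: &mut Ctx,
        _: &Task,
        _: &HttpStatus,
        doc: &Document,
    ) -> task_expanders::Result {
        // Count <a> elements on the page with the same select.rs API used by
        // DataExtractor above.
        let link_count = doc.find(Name("a")).count();
        ctx.task_state.link_count = link_count;                 // per-task data
        ctx.job_state.lock().unwrap().sum_links += link_count;  // job-wide total
        Ok(())
    }
}
```

Such an expander would be registered exactly like the built-in ones, e.g. by adding `.with_task_expander(|| LinkCounter {})` to the CrawlingRules builder chain.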
If you are interested in the area of broad web crawling, there is crusty, developed fully on top of crusty-core, which tries to tackle some of the challenges of broad web crawling.