Crates.io | scraper_query |
lib.rs | scraper_query |
version | 0.4.0 |
source | src |
created_at | 2024-10-22 11:40:21.716065 |
updated_at | 2024-11-08 07:23:47.298118 |
description | Ergonomic Query for HTML with Scraper |
homepage | https://github.com/ifsheldon/scraper_query |
repository | https://github.com/ifsheldon/scraper_query |
max_upload_size | |
id | 1418545 |
size | 23,493 |
scraper_query
is a simple tool for querying components in HTML documents parsed with `scraper`,
so that you can easily perform simple HTML manipulations, which are common in web crawling, web scraping, and data cleaning.
use scraper::Html;
use scraper_query::*; // use `HTMLIndex`, `Tag`, `class`, `id`
use markup5ever::interface::tree_builder::TreeSink;
// Parse the HTML source; the binding is mutable because nodes are detached at the end.
let mut document = Html::parse_document(HTML);
// Build the query index from the parsed document.
let index = HTMLIndex::new(&document);

// Query: nodes that carry both class "foo" and class "bar".
let foo_and_bar = class("foo") & class("bar");
let node_ids = index.query(foo_and_bar);

// Query: nodes whose id is "foo".
let id_is_foo = id("foo");
let node_ids = index.query(id_is_foo);

// Query: `h1` nodes that have class "foo".
// `Tag::H1 & class("foo")` is the same as `Tag::H1.and(class("foo"))`.
let h1_with_foo = Tag::H1 & class("foo");
let node_ids = index.query(h1_with_foo);

// Query: `h1` nodes that do NOT have class "foo".
let h1_without_foo = Tag::H1 & (!class("foo"));
let node_ids = index.query(h1_without_foo);

// Simple manipulation: detach every node matched by the last query from its parent.
// The loop variable is named `node_id` so it does not shadow the imported `id` helper.
for node_id in node_ids {
    document.remove_from_parent(&node_id);
}