| Crates.io | halldyll-core |
| lib.rs | halldyll-core |
| version | 0.1.0 |
| created_at | 2026-01-20 23:48:45.269434+00 |
| updated_at | 2026-01-20 23:48:45.269434+00 |
| description | Core scraping engine for Halldyll - high-performance async web scraper for AI agents |
| homepage | https://github.com/Mr-soloDev/halldyll-Scrapper |
| repository | https://github.com/Mr-soloDev/halldyll-Scrapper |
| max_upload_size | |
| id | 2057865 |
| size | 421,881 |
High-performance async web scraper written in Rust with Python bindings, designed for AI agents and cloud deployments.
halldyll/
├── halldyll-core    - Orchestration, HTTP client, rate limiting, storage (63 tests)
├── halldyll-parser  - HTML parsing, text/link/metadata extraction (220 tests)
├── halldyll-media   - Image, video, audio, document extraction (118 tests)
├── halldyll-robots  - robots.txt parsing and caching (45 tests)
└── halldyll-python  - Python bindings via PyO3 (8 tests)
Total: 452 tests passing ✅
Add to your Cargo.toml:
[dependencies]
halldyll-core = "0.1"
tokio = { version = "1", features = ["full"] }
use halldyll_core::{Orchestrator, Config};
use url::Url;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Use cloud-optimized config (polite, production-ready)
    let config = Config::cloud();
    let orchestrator = Orchestrator::new(config)?;

    let url = Url::parse("https://example.com")?;
    let result = orchestrator.scrape(&url).await?;

    println!("Title: {:?}", result.document.title);
    println!("Text: {} chars", result.document.main_text.len());
    println!("Links found: {}", result.discovered_links.len());

    Ok(())
}
pip install halldyll
from halldyll import scrape, HalldyllScraper, ScraperConfig
# Simple one-liner
result = scrape("https://example.com")
print(result.title, result.text[:200])
# With configuration
config = ScraperConfig.cloud() # Production-ready settings
with HalldyllScraper(config) as scraper:
    results = scraper.scrape_batch([
        "https://example.com",
        "https://rust-lang.org",
        "https://python.org",
    ])

    for r in results:
        if r.success:
            print(f"{r.url}: {r.word_count} words")
        else:
            print(f"{r.url}: Error - {r.error}")
| Preset | Use Case | Settings |
|---|---|---|
| `Config::default()` | General use | 2 concurrent/domain, 100ms delay, robots.txt on |
| `Config::cloud()` | Production/AI agents | 1 concurrent/domain, 1s delay, 30s timeout, metrics on |
| `Config::polite()` | Sensitive targets | 1 concurrent/domain, 3s delay, strict limits |
| `Config::fast()` | Dev/testing only | 10 concurrent/domain, no robots.txt ⚠️ |
use halldyll_core::Config;
let mut config = Config::default();
// HTTP settings
config.fetch.user_agent = "MyBot/1.0".to_string();
config.fetch.total_timeout_ms = 30000;
config.fetch.max_retries = 3;
// Politeness
config.politeness.respect_robots_txt = true;
config.politeness.default_delay_ms = 1000;
config.politeness.max_concurrent_per_domain = 2;
// Extraction
config.parse.extract_json_ld = true;
config.parse.extract_images = true;
config.parse.segment_text = true;
config.parse.chunk_size = 1000;
// Security
config.security.block_private_ips = true;
config.security.max_response_size = 10 * 1024 * 1024; // 10MB
from halldyll import ScraperConfig
config = ScraperConfig(
    user_agent="MyBot/1.0",
    connect_timeout_ms=5000,
    max_concurrent=2,
    respect_robots=True,
    max_depth=5,
)
from langchain.tools import Tool
from halldyll import scrape
def scrape_url(url: str) -> str:
    """Scrape a webpage and return its content."""
    result = scrape(url)
    if result.success:
        return f"Title: {result.title}\n\nContent:\n{result.text[:5000]}"
    return f"Error: {result.error}"
scrape_tool = Tool(
    name="web_scraper",
    description="Scrape a webpage to get its text content. Input: URL",
    func=scrape_url,
)
# Use in your agent
agent.tools.append(scrape_tool)
from crewai import Agent, Task
from halldyll import HalldyllScraper, ScraperConfig
config = ScraperConfig.cloud()
scraper = HalldyllScraper(config)
researcher = Agent(
    role="Web Researcher",
    goal="Extract information from websites",
    tools=[scraper],
)

task = Task(
    description="Research the latest Rust features from rust-lang.org",
    agent=researcher,
)
from agent_framework import Agent, tool
from halldyll import scrape, HalldyllScraper, ScraperConfig
@tool
def web_scrape(url: str) -> dict:
    """Scrape a webpage and extract its content."""
    result = scrape(url)
    return {
        "title": result.title,
        "text": result.text[:3000],
        "links": result.links[:10],
        "images": result.images[:5],
    }
agent = Agent(
    name="research_agent",
    tools=[web_scrape],
    model="gpt-4o",
)

# Run from within an async function / event loop:
response = await agent.run("Research and summarize https://example.com")
from halldyll import HalldyllScraper, ScraperConfig
config = ScraperConfig.cloud()
with HalldyllScraper(config) as scraper:
    urls = [
        "https://docs.python.org/3/tutorial/",
        "https://doc.rust-lang.org/book/",
        # ... more URLs
    ]
    results = scraper.scrape_batch(urls)

    # Prepare for vector database
    documents = []
    for r in results:
        if r.has_content:
            documents.append({
                "url": r.url,
                "title": r.title,
                "text": r.text,
                "metadata": r.to_dict(),
            })

    # Insert into your vector DB (Pinecone, Weaviate, Qdrant, etc.)
    vector_db.upsert(documents)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: halldyll-scraper
spec:
  replicas: 3
  selector:
    matchLabels:
      app: halldyll-scraper
  template:
    metadata:
      labels:
        app: halldyll-scraper
    spec:
      containers:
        - name: scraper
          image: your-registry/halldyll:latest
          ports:
            - containerPort: 8080
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 5
          resources:
            requests:
              memory: "64Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "500m"
use halldyll_core::{
    HealthChecker, HealthMetrics, PrometheusExporter,
    MetricsCollector, GracefulShutdown,
};
use std::sync::Arc;
// Health checker
let health = HealthChecker::default_config();
// GET /healthz - Liveness probe
let liveness = health.liveness();
// Returns: {"status": "healthy", "uptime_secs": 3600, ...}
// GET /readyz - Readiness probe
let metrics = HealthMetrics {
    success_rate: 0.98,
    avg_latency_ms: 150.0,
    open_circuits: 0,
    memory_mb: Some(128),
    active_requests: 5,
};
let readiness = health.readiness(&metrics);
// GET /metrics - Prometheus format
let collector = MetricsCollector::new();
let exporter = PrometheusExporter::new(&collector);
let prometheus_output = exporter.export();
// Returns: halldyll_requests_total 1234
// halldyll_success_rate 0.98
// ...
// Graceful shutdown
let shutdown = Arc::new(GracefulShutdown::default_timeout());
// On SIGTERM: shutdown.wait_for_completion().await
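The Kubernetes probes above expect these endpoints to be served over HTTP; the HTTP server itself is left to the host application. A minimal sketch using axum (the framework choice is an assumption, and the placeholder handler bodies stand in for the `liveness()`, `readiness()`, and `export()` calls shown above):
use axum::{routing::get, Router};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Replace the placeholder bodies with health.liveness(),
    // health.readiness(&metrics), and exporter.export() from above.
    let app = Router::new()
        .route("/healthz", get(|| async { "ok" }))
        .route("/readyz", get(|| async { "ready" }))
        .route("/metrics", get(|| async { "halldyll_requests_total 0" }));

    let listener = tokio::net::TcpListener::bind("0.0.0.0:8080").await?;
    axum::serve(listener, app).await?;
    Ok(())
}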
use halldyll_core::{CircuitBreaker, CircuitBreakerConfig};
// Production config: tolerant, slow recovery
let breaker = CircuitBreaker::new(CircuitBreakerConfig::production());
// Inside the crawl loop, before each request
if !breaker.allow_request("example.com") {
    // Domain circuit is open: skip this URL or queue it for later
    continue;
}

// After the request completes
match result {
    Ok(_) => breaker.record_success("example.com"),
    Err(e) if e.is_timeout() => breaker.record_timeout("example.com"),
    Err(e) if e.is_server_error() => breaker.record_server_error("example.com"),
    Err(_) => breaker.record_failure("example.com"),
}
// Monitor open circuits
let open = breaker.get_open_circuits();
println!("Failing domains: {:?}", open);
from halldyll import (
    scrape,
    HalldyllError,    # Base exception
    NetworkError,     # Connection, timeout, DNS
    HttpError,        # 4xx, 5xx status codes
    ParseError,       # HTML parsing failures
    RateLimitError,   # 429 Too Many Requests
    RobotsError,      # Blocked by robots.txt
    ValidationError,  # Invalid URL
)

try:
    result = scrape("https://example.com")
except NetworkError as e:
    print(f"Network issue: {e}")
    # Retry with backoff
except RateLimitError as e:
    print(f"Rate limited: {e}")
    # Wait and retry
except RobotsError as e:
    print(f"Blocked by robots.txt: {e}")
    # Skip this URL
except HalldyllError as e:
    print(f"Scraper error: {e}")
| Feature | Description |
|---|---|
| Main Text | Boilerplate removal, clean content extraction |
| Title | Page title with fallbacks (og:title, h1) |
| Description | Meta description, og:description |
| JSON-LD | Structured data (Schema.org) |
| OpenGraph | Social media metadata |
| Images | URLs, dimensions, alt text, lazy-load resolution |
| Videos | YouTube, Vimeo, embedded videos |
| Audio | Podcast feeds, audio embeds |
| Links | Internal/external classification, anchor text |
| Canonical URL | Resolved canonical URL |
| Pagination | Next/prev page detection |
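On the Rust side these land on the scrape result; the sketch below reuses only the fields already shown in the quick-start (`title`, `main_text`, `discovered_links`), since the exact field names for images, JSON-LD, and the other items in the table are not confirmed by this page:
// Reusing only the fields shown in the quick-start; the richer metadata
// (images, JSON-LD, OpenGraph, ...) is also on the result, but take the
// exact field names from the crate docs rather than this sketch.
let result = orchestrator.scrape(&url).await?;
println!("Title: {:?}", result.document.title);
println!("Text: {} chars", result.document.main_text.len());
for link in &result.discovered_links {
    println!("Link: {:?}", link);
}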
Each crate can be used independently:
# Just the parser
[dependencies]
halldyll-parser = "0.1"
# Just robots.txt
[dependencies]
halldyll-robots = "0.1"
# Just media extraction
[dependencies]
halldyll-media = "0.1"
// Use parser standalone
use halldyll_parser::HtmlParser;
let html = r#"<html><body><h1>Hello</h1><p>World</p></body></html>"#;
let parser = HtmlParser::new(html);
let text = parser.extract_text();
let links = parser.extract_links("https://example.com");
// Rotate user agents across requests (`config` is the Config shown above,
// `urls` is the list being crawled)
let agents = vec![
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Safari/17.0",
    "Mozilla/5.0 (X11; Linux x86_64) Firefox/121.0",
];

for (i, url) in urls.iter().enumerate() {
    config.fetch.user_agent = agents[i % agents.len()].to_string();
    // ...
}
| Metric | Halldyll | Scrapy | Playwright |
|---|---|---|---|
| Speed (pages/min) | ~500 | ~150 | ~50 |
| Memory (10K pages) | ~50 MB | ~300 MB | ~800 MB |
| Startup time | <100ms | ~2s | ~5s |
# Run all tests (452 tests)
cargo test --workspace
# Run with output
cargo test --workspace -- --nocapture
# Run specific crate
cargo test -p halldyll-parser
cargo test -p halldyll-media
cargo test -p halldyll-robots
halldyll-scrapper/
├── Cargo.toml              # Rust workspace
├── crates/
│   ├── halldyll-core/      # Core scraping engine
│   │   └── src/
│   │       ├── fetch/      # HTTP client, circuit breaker
│   │       ├── observe/    # Metrics, health, shutdown
│   │       ├── storage/    # Dedup, content store
│   │       └── types/      # Config, errors
│   ├── halldyll-parser/    # HTML extraction (220 tests)
│   ├── halldyll-media/     # Media extraction (118 tests)
│   ├── halldyll-robots/    # robots.txt (45 tests)
│   └── halldyll-python/    # PyO3 bindings
├── examples/               # Usage examples
└── README.md
MIT License - see LICENSE file.
Geryan Roy