Crates.io | wikidump |
lib.rs | wikidump |
version | 0.3.1 |
source | src |
created_at | 2019-09-14 15:38:54.853438 |
updated_at | 2024-09-01 03:29:34.925602 |
description | A library for parsing Mediawiki XML dumps |
homepage | |
repository | https://github.com/camchenry/wikidump |
max_upload_size | |
id | 164780 |
size | 37,880 |
This crate processes Mediawiki XML dump files and turns them into easily consumed pieces of data for language analysis, natural language processing, and other applications.
// Build a parser preconfigured for English Wikipedia dumps.
let parser = Parser::new().use_config(config::wikipedia::english());

// Parse a partial dump shipped with the test fixtures; a malformed
// or unreadable file surfaces here as a panic with context.
let site = parser
    .parse_file("tests/enwiki-articles-partial.xml")
    .expect("Could not parse wikipedia dump file.");

// Site-level metadata comes from the dump's <siteinfo> header.
assert_eq!(site.name, "Wikipedia");
assert_eq!(site.url, "https://en.wikipedia.org/wiki/Main_Page");
assert!(!site.pages.is_empty());

// Walk every parsed page, printing its title and the text of each revision.
for page in site.pages {
    println!("Title: {}", page.title);
    page.revisions
        .into_iter()
        .for_each(|revision| println!("\t{}", revision.text));
}