Crates.io | parsercher |
lib.rs | parsercher |
version | 3.1.6 |
source | src |
created_at | 2021-05-07 17:03:29.507889 |
updated_at | 2022-12-26 15:15:10.756585 |
description | Parses and searches Tag documents. (e.g. HTML, XML) |
homepage | |
repository | https://github.com/kkmtyyz/parsercher |
max_upload_size | |
id | 394299 |
size | 139,049 |
Parses and searches Tag documents. (e.g. HTML, XML)
parsercher parses documents written in tags such as HTML and XML.
Add this to your Cargo.toml:
[dependencies]
parsercher = "3.1.6"
MIT OR Apache-2.0
Example of getting text from HTML.
Create a Dom structure tree from the HTML and get the text of each `li` tag whose `class` attribute value is `target`.
use parsercher;
use parsercher::dom::Tag;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>sample html</title>
</head>
<body>
<ol>
<li class="target">first</li>
<li>second</li>
<li class="target">therd</li>
</ol>
</body>
</html>
"#;
if let Ok(root_dom) = parsercher::parse(&html) {
let mut needle = Tag::new("li");
needle.set_attr("class", "target");
if let Some(texts) = parsercher::search_text_from_tag_children(&root_dom, &needle) {
assert_eq!(texts.len(), 2);
assert_eq!(texts[0], "first".to_string());
assert_eq!(texts[1], "therd".to_string());
}
}
Example of searching a subtree from the Dom structure tree.
Find a subtree that has a `ul` tag whose `class` attribute value is `targetList` and two `li` tags under it. The `class` attribute values of the `li` tags must be `key1` and `key2`, respectively.
Looking for:
<ul class="targetList">
<li class="key1"></li>
<li class="key2"></li>
</ul>
use parsercher;
let doc = r#"
<body>
<ul id="list1" class="targetList">
<li class="key1">1-1</li>
<li class="key2">
<span>1-2</span>
</li>
</ul>
<ul id="list2">
<li class="key1">2-1</li>
<li>2-2</li>
</ul>
<div>
<div>
<ul class="targetList">
<ul id="list3" class="targetList">
<li class="key1">3-1</li>
<li class="item">3-2</li>
<li class="key2">3-3</li>
</ul>
</ul>
</div>
</div>
<ul id="list4">
<li class="key1">4-1</li>
<li class="key2">4-2</li>
</ul>
</body>
"#;
let root_dom = parsercher::parse(&doc).unwrap();
let needle = r#"
<ul class="targetList">
<li class="key1"></li>
<li class="key2"></li>
</ul>
"#;
let result = root_dom.search(&needle).unwrap().unwrap();
for dom in result.iter() {
parsercher::print_dom_tree(&dom);
}
output:
<ul class="targetList" id="list1">
<li class="key1">
TEXT: "1-1"
<li class="key2">
<span>
TEXT: "1-2"
<ul class="targetList" id="list3">
<li class="key1">
TEXT: "3-1"
<li class="item">
TEXT: "3-2"
<li class="key2">
TEXT: "3-3"
A more complex example of a Dom structure tree.
use parsercher;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>sample html</title>
</head>
<body>
<h1>Hello, world!</h1>
<div id="content"></div>
<ol>
<li>first</li>
<li>second</li>
<li>therd</li>
</ol>
<!-- All script code becomes one text -->
<script>
let content = document.getElementById('content');
content.textContent = 'content';
</script>
</body>
</html>
"#;
if let Ok(dom) = parsercher::parse(&html) {
println!("{:#?}", dom);
}
output:
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "root",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "!DOCTYPE",
attr: Some(
{
"html": "",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "html",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "head",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "meta",
attr: Some(
{
"charset": "UTF-8",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "title",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "sample html",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "body",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "h1",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "Hello, world!",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "div",
attr: Some(
{
"id": "content",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "ol",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "first",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "second",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "therd",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
Dom {
dom_type: Comment,
tag: None,
text: None,
comment: Some(
Comment {
comment: " All script code becomes one text ",
},
),
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "script",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "\n let content = document.getElementById(\'content\');\n content.textContent = \'content\';\n",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
],
),
},
],
),
}