//! Syntax tree definitions with change metadata.
#![allow(clippy::mutable_key_type)] // Hash for Syntax doesn't use mutable fields.
use std::{
cell::Cell,
collections::HashMap,
env, fmt,
hash::{Hash, Hasher},
};
use typed_arena::Arena;
use crate::{
lines::{LineNumber, NewlinePositions},
myers_diff,
positions::SingleLineSpan,
};
use ChangeKind::*;
use Syntax::*;
#[derive(PartialEq, Eq, Clone, Copy)]
pub enum ChangeKind<'a> {
Unchanged(&'a Syntax<'a>),
ReplacedComment(&'a Syntax<'a>, &'a Syntax<'a>),
Novel,
}
/// A Debug implementation that ignores the corresponding node
/// mentioned for Unchanged. Otherwise we would recurse forever on
/// unchanged nodes, which each point to the node on the other side.
impl<'a> fmt::Debug for ChangeKind<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let desc = match self {
Unchanged(_) => "Unchanged",
ReplacedComment(_, _) => "ReplacedComment",
Novel => "Novel",
};
f.write_str(desc)
}
}
/// Fields that are common to both `Syntax::List` and `Syntax::Atom`.
pub struct SyntaxInfo<'a> {
/// The next node with the same parent as this one.
pub next_sibling: Cell<Option<&'a Syntax<'a>>>,
/// The syntax node that occurs before this one, in a depth-first
/// tree traversal.
pub prev: Cell<Option<&'a Syntax<'a>>>,
/// The parent syntax node, if present.
parent: Cell<Option<&'a Syntax<'a>>>,
/// Does the previous syntax node occur on the same line as the
/// first line of this node?
prev_is_contiguous: Cell<bool>,
/// Whether or not this syntax node has changed. This value is set
/// when computing the diff with another syntax tree.
pub change: Cell<Option<ChangeKind<'a>>>,
/// The number of nodes that are ancestors of this one.
num_ancestors: Cell<u32>,
/// A number that uniquely identifies this syntax node.
unique_id: Cell<u32>,
/// A number that uniquely identifies the content of this syntax
/// node. This may be the same as nodes on the other side of the
/// diff, or nodes at different positions.
///
/// Values are sequential, not hashes. Collisions never occur.
content_id: Cell<u32>,
}
impl<'a> SyntaxInfo<'a> {
pub fn new() -> Self {
Self {
next_sibling: Cell::new(None),
prev: Cell::new(None),
parent: Cell::new(None),
prev_is_contiguous: Cell::new(false),
change: Cell::new(None),
num_ancestors: Cell::new(0),
unique_id: Cell::new(0),
content_id: Cell::new(0),
}
}
}
impl<'a> Default for SyntaxInfo<'a> {
fn default() -> Self {
Self::new()
}
}
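/// A node in the difftastic syntax tree: either a delimited `List` with
/// children, or a leaf `Atom` holding a single token's content.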
pub enum Syntax<'a> {
List {
info: SyntaxInfo<'a>,
open_position: Vec<SingleLineSpan>,
open_content: String,
children: Vec<&'a Syntax<'a>>,
close_position: Vec<SingleLineSpan>,
close_content: String,
num_descendants: u32,
},
Atom {
info: SyntaxInfo<'a>,
position: Vec<SingleLineSpan>,
content: String,
kind: AtomKind,
},
}
fn dbg_pos(pos: &[SingleLineSpan]) -> String {
match pos {
[] => "-".into(),
[pos] => format!("{}:{}-{}", pos.line.0, pos.start_col, pos.end_col),
[start, .., end] => format!(
"{}:{}-{}:{}",
start.line.0, start.start_col, end.line.0, end.end_col
),
}
}
impl<'a> fmt::Debug for Syntax<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
List {
open_content,
open_position,
children,
close_content,
close_position,
info,
..
} => {
let mut ds = f.debug_struct(&format!(
"List id:{} content:{}",
self.id(),
self.content_id()
));
ds.field("open_content", &open_content)
.field("open_position", &dbg_pos(open_position))
.field("children", &children)
.field("close_content", &close_content)
.field("close_position", &dbg_pos(close_position));
if env::var("DFT_VERBOSE").is_ok() {
ds.field("change", &info.change.get());
let next_sibling_s = match info.next_sibling.get() {
Some(List { .. }) => "Some(List)",
Some(Atom { .. }) => "Some(Atom)",
None => "None",
};
ds.field("next_sibling", &next_sibling_s);
}
ds.finish()
}
Atom {
content,
position,
info,
kind: highlight,
..
} => {
let mut ds = f.debug_struct(&format!(
"Atom id:{} content:{}",
self.id(),
self.content_id()
));
ds.field("content", &content);
ds.field("position", &dbg_pos(position));
if env::var("DFT_VERBOSE").is_ok() {
ds.field("highlight", highlight);
ds.field("change", &info.change.get());
let next_sibling_s = match info.next_sibling.get() {
Some(List { .. }) => "Some(List)",
Some(Atom { .. }) => "Some(Atom)",
None => "None",
};
ds.field("next_sibling", &next_sibling_s);
}
ds.finish()
}
}
}
}
impl<'a> Syntax<'a> {
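/// Build a `List` node in `arena`. Empty atoms are dropped and trivial
/// wrapper lists (no delimiters, a single child) are flattened; see the
/// comments in the body and the `test_flatten_trivial_list` /
/// `test_ignore_empty_atoms` tests below.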
pub fn new_list(
arena: &'a Arena<Syntax<'a>>,
open_content: &str,
open_position: Vec<SingleLineSpan>,
children: Vec<&'a Syntax<'a>>,
close_content: &str,
close_position: Vec<SingleLineSpan>,
) -> &'a Syntax<'a> {
// Skip empty atoms: they aren't displayed, so there's no
// point making our syntax tree bigger. These occur when we're
// parsing incomplete or malformed programs.
let children = children
.into_iter()
.filter(|n| match n {
List { .. } => true,
Atom { content, .. } => !content.is_empty(),
})
.collect::<Vec<_>>();
// Don't bother creating a list if we have no open/close and
// there's only one child. This occurs in small files with
// thorough tree-sitter parsers: you get parse trees like:
//
// (compilation-unit (top-level-def (function ...)))
//
// This is a small performance win as it makes the difftastic
// syntax tree smaller. It also really helps when looking at
// debug output for small inputs.
if children.len() == 1 && open_content.is_empty() && close_content.is_empty() {
return children[0];
}
let mut num_descendants = 0;
for child in &children {
num_descendants += match child {
List {
num_descendants, ..
} => *num_descendants + 1,
Atom { .. } => 1,
};
}
arena.alloc(List {
info: SyntaxInfo::default(),
open_position,
open_content: open_content.into(),
close_content: close_content.into(),
close_position,
children,
num_descendants,
})
}
pub fn new_atom(
arena: &'a Arena<Syntax<'a>>,
position: Vec<SingleLineSpan>,
content: &str,
kind: AtomKind,
) -> &'a Syntax<'a> {
arena.alloc(Atom {
info: SyntaxInfo::default(),
position,
content: content.into(),
kind,
})
}
pub fn info(&self) -> &SyntaxInfo<'a> {
match self {
List { info, .. } | Atom { info, .. } => info,
}
}
pub fn parent(&self) -> Option<&'a Syntax<'a>> {
self.info().parent.get()
}
pub fn next_sibling(&self) -> Option<&'a Syntax<'a>> {
self.info().next_sibling.get()
}
pub fn prev_is_contiguous(&self) -> bool {
self.info().prev_is_contiguous.get()
}
/// A unique ID of this syntax node. Every node is guaranteed to
/// have a different value.
pub fn id(&self) -> u32 {
self.info().unique_id.get()
}
/// A content ID of this syntax node. Two nodes have the same
/// content ID if they have the same content, regardless of
/// position.
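///
/// Illustrative sketch (not a doctest; `arena`, `lhs_pos` and `rhs_pos`
/// are assumed to be in scope): two atoms with the same text share a
/// content ID even when their positions differ, mirroring the unit tests
/// at the bottom of this file.
///
/// ```ignore
/// let lhs = Syntax::new_atom(&arena, lhs_pos, "foo", AtomKind::Normal);
/// let rhs = Syntax::new_atom(&arena, rhs_pos, "foo", AtomKind::Normal);
/// init_all_info(&[lhs], &[rhs]);
/// assert_eq!(lhs.content_id(), rhs.content_id());
/// ```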
pub fn content_id(&self) -> u32 {
self.info().content_id.get()
}
pub fn num_ancestors(&self) -> u32 {
self.info().num_ancestors.get()
}
pub fn dbg_content(&self) -> String {
match self {
List {
open_content,
close_content,
..
} => format!("{} ... {}", open_content, close_content),
Atom { content, .. } => content.into(),
}
}
pub fn first_line(&self) -> Option<LineNumber> {
let position = match self {
List { open_position, .. } => open_position,
Atom { position, .. } => position,
};
position.first().map(|lp| lp.line)
}
pub fn last_line(&self) -> Option<LineNumber> {
let position = match self {
List { close_position, .. } => close_position,
Atom { position, .. } => position,
};
position.last().map(|lp| lp.line)
}
pub fn change(&'a self) -> Option<ChangeKind<'a>> {
self.info().change.get()
}
pub fn set_change(&self, ck: ChangeKind<'a>) {
self.info().change.set(Some(ck));
}
pub fn set_change_deep(&self, ck: ChangeKind<'a>) {
self.set_change(ck);
if let List { children, .. } = self {
// For unchanged lists, match up children with the
// unchanged children on the other side.
if let Unchanged(List {
children: other_children,
..
}) = ck
{
for (child, other_child) in children.iter().zip(other_children) {
child.set_change_deep(Unchanged(other_child));
}
} else {
for child in children {
child.set_change_deep(ck);
}
};
}
}
}
/// Initialise all the fields in `SyntaxInfo`.
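///
/// Usage sketch (illustrative, not a doctest; `arena` and the positions
/// are assumed to come from the caller, as in the tests below). IDs,
/// parents and siblings are only meaningful after this call.
///
/// ```ignore
/// let lhs = Syntax::new_atom(&arena, lhs_pos, "foo", AtomKind::Normal);
/// let rhs = Syntax::new_atom(&arena, rhs_pos, "foo", AtomKind::Normal);
/// init_all_info(&[lhs], &[rhs]);
/// assert_eq!(lhs, rhs); // Equality compares content IDs, so init must run first.
/// ```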
pub fn init_all_info<'a>(lhs_roots: &[&'a Syntax<'a>], rhs_roots: &[&'a Syntax<'a>]) {
init_info(lhs_roots, rhs_roots);
init_next_prev(lhs_roots);
init_next_prev(rhs_roots);
}
pub fn init_info<'a>(lhs_roots: &[&'a Syntax<'a>], rhs_roots: &[&'a Syntax<'a>]) {
// TODO: start unique IDs at 1. This helps with hashing and makes
// it obvious if we've forgotten to init the syntax info.
let mut id = 0;
init_info_single(lhs_roots, &mut id);
init_info_single(rhs_roots, &mut id);
let mut existing = HashMap::new();
set_content_id(lhs_roots, &mut existing);
set_content_id(rhs_roots, &mut existing);
}
type ContentKey = (Option<String>, Option<String>, Vec<u32>, bool, bool);
fn set_content_id(nodes: &[&Syntax], existing: &mut HashMap<ContentKey, u32>) {
for node in nodes {
let key: ContentKey = match node {
List {
open_content,
close_content,
children,
..
} => {
// Recurse first, so children all have their content_id set.
set_content_id(children, existing);
let children_content_ids: Vec<_> =
children.iter().map(|c| c.info().content_id.get()).collect();
(
Some(open_content.clone()),
Some(close_content.clone()),
children_content_ids,
true,
true,
)
}
Atom {
content,
kind: highlight,
..
} => {
let is_comment = *highlight == AtomKind::Comment;
let clean_content = if is_comment && content.lines().count() > 1 {
content
.lines()
.map(|l| l.trim_start())
.collect::<Vec<_>>()
.join("\n")
} else {
content.clone()
};
(Some(clean_content), None, vec![], false, is_comment)
}
};
// Ensure the ID is always greater than zero, so we can
// distinguish an uninitialised SyntaxInfo value.
let next_id = existing.len() as u32 + 1;
let content_id = existing.entry(key).or_insert(next_id);
node.info().content_id.set(*content_id);
}
}
pub fn init_next_prev<'a>(roots: &[&'a Syntax<'a>]) {
set_next_sibling(roots);
set_prev(roots, None);
set_prev_is_contiguous(roots);
}
fn init_info_single<'a>(roots: &[&'a Syntax<'a>], next_id: &mut u32) {
set_parent(roots, None);
set_num_ancestors(roots, 0);
set_unique_id(roots, next_id)
}
fn set_unique_id<'a>(nodes: &[&'a Syntax<'a>], next_id: &mut u32) {
for node in nodes {
node.info().unique_id.set(*next_id);
*next_id += 1;
if let List { children, .. } = node {
set_unique_id(children, next_id);
}
}
}
fn set_next_sibling<'a>(nodes: &[&'a Syntax<'a>]) {
for (i, node) in nodes.iter().enumerate() {
let sibling = nodes.get(i + 1).copied();
node.info().next_sibling.set(sibling);
if let List { children, .. } = node {
set_next_sibling(children);
}
}
}
/// For every syntax node in the tree, mark the previous node
/// according to a preorder traversal.
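///
/// Sketch: for roots `[x, (y z)]`, this sets `prev(x) = None`,
/// `prev(list) = x`, `prev(y) = list` and `prev(z) = y` (the first child
/// of a list points back to the list itself).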
fn set_prev<'a>(nodes: &[&'a Syntax<'a>], parent: Option<&'a Syntax<'a>>) {
for (i, node) in nodes.iter().enumerate() {
let node_prev = if i == 0 { parent } else { Some(nodes[i - 1]) };
node.info().prev.set(node_prev);
if let List { children, .. } = node {
set_prev(children, Some(node));
}
}
}
fn set_parent<'a>(nodes: &[&'a Syntax<'a>], parent: Option<&'a Syntax<'a>>) {
for node in nodes {
node.info().parent.set(parent);
if let List { children, .. } = node {
set_parent(children, Some(node));
}
}
}
fn set_num_ancestors<'a>(nodes: &[&Syntax<'a>], num_ancestors: u32) {
for node in nodes {
node.info().num_ancestors.set(num_ancestors);
if let List { children, .. } = node {
set_num_ancestors(children, num_ancestors + 1);
}
}
}
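/// For every node, record whether its predecessor in the depth-first
/// traversal ends on the same line that this node starts on (using the
/// open delimiter's position when the predecessor is the parent list).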
fn set_prev_is_contiguous<'a>(roots: &[&Syntax<'a>]) {
for node in roots {
let is_contiguous = if let Some(prev) = node.info().prev.get() {
match prev {
List {
open_position,
close_position,
..
} => {
let prev_is_parent = prev.num_ancestors() < node.num_ancestors();
if prev_is_parent {
open_position.last().map(|p| p.line) == node.first_line()
} else {
// predecessor node at the same level.
close_position.last().map(|p| p.line) == node.first_line()
}
}
Atom { .. } => prev.last_line() == node.first_line(),
}
} else {
false
};
node.info().prev_is_contiguous.set(is_contiguous);
if let List { children, .. } = node {
set_prev_is_contiguous(children);
}
}
}
impl<'a> PartialEq for Syntax<'a> {
fn eq(&self, other: &Self) -> bool {
debug_assert!(self.content_id() > 0);
debug_assert!(other.content_id() > 0);
self.content_id() == other.content_id()
}
}
impl<'a> Eq for Syntax<'a> {}
impl<'a> Hash for Syntax<'a> {
fn hash<H: Hasher>(&self, state: &mut H) {
self.content_id().hash(state);
}
}
#[derive(PartialEq, Eq, Debug, Clone, Copy, Hash)]
pub enum AtomKind {
Normal,
String,
Type,
Comment,
Keyword,
}
/// Unlike atoms, tokens can be delimiters like `{`.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum TokenKind {
Delimiter,
Atom(AtomKind),
}
/// A matched token (an atom, a delimiter, or a comment word).
#[derive(PartialEq, Eq, Debug, Clone)]
pub enum MatchKind {
UnchangedToken {
highlight: TokenKind,
self_pos: Vec<SingleLineSpan>,
opposite_pos: Vec<SingleLineSpan>,
},
Novel {
highlight: TokenKind,
},
NovelLinePart {
highlight: TokenKind,
self_pos: SingleLineSpan,
opposite_pos: Vec<SingleLineSpan>,
},
NovelWord {
highlight: TokenKind,
},
}
impl MatchKind {
pub fn first_opposite_span(&self) -> Option<SingleLineSpan> {
match self {
MatchKind::UnchangedToken { opposite_pos, .. } => opposite_pos.first().copied(),
MatchKind::NovelLinePart { opposite_pos, .. } => opposite_pos.first().copied(),
MatchKind::Novel { .. } => None,
MatchKind::NovelWord { .. } => None,
}
}
pub fn is_change(&self) -> bool {
matches!(
self,
MatchKind::Novel { .. } | MatchKind::NovelWord { .. } | MatchKind::NovelLinePart { .. }
)
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MatchedPos {
pub kind: MatchKind,
pub pos: SingleLineSpan,
}
/// Split `s` into a vec of things that look like words and individual
/// non-word characters.
///
/// "foo bar" -> vec!["foo", " ", "bar"]
pub fn split_words(s: &str) -> Vec<&str> {
let mut res = vec![];
let mut word_start = None;
for (idx, c) in s.char_indices() {
if c.is_alphanumeric() {
if word_start.is_none() {
word_start = Some(idx);
}
} else {
if let Some(start) = word_start {
res.push(&s[start..idx]);
word_start = None;
}
res.push(&s[idx..idx + c.len_utf8()]);
}
}
if let Some(start) = word_start {
res.push(&s[start..]);
}
res
}
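/// Diff the words of two changed comments against each other, producing a
/// `MatchedPos` for every word on this side: words only present here are
/// marked `NovelWord`, and words that also occur on the other side are
/// marked `NovelLinePart` with their opposite positions.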
fn split_comment_words(
content: &str,
pos: SingleLineSpan,
opposite_content: &str,
opposite_pos: SingleLineSpan,
) -> Vec<MatchedPos> {
// TODO: merge adjacent single-line comments unless there are
// blank lines between them.
let content_parts = split_words(content);
let other_parts = split_words(opposite_content);
let content_newlines = NewlinePositions::from(content);
let opposite_content_newlines = NewlinePositions::from(opposite_content);
let mut offset = 0;
let mut opposite_offset = 0;
let mut res = vec![];
for diff_res in myers_diff::slice(&content_parts, &other_parts) {
match diff_res {
myers_diff::DiffResult::Left(word) => {
// This word is novel to this side.
res.push(MatchedPos {
kind: MatchKind::NovelWord {
highlight: TokenKind::Atom(AtomKind::Comment),
},
pos: content_newlines.from_offsets_relative_to(
pos,
offset,
offset + word.len(),
)[0],
});
offset += word.len();
}
myers_diff::DiffResult::Both(word, opposite_word) => {
// This word is present on both sides.
let word_pos =
content_newlines.from_offsets_relative_to(pos, offset, offset + word.len())[0];
let opposite_word_pos = opposite_content_newlines.from_offsets_relative_to(
opposite_pos,
opposite_offset,
opposite_offset + opposite_word.len(),
);
res.push(MatchedPos {
kind: MatchKind::NovelLinePart {
highlight: TokenKind::Atom(AtomKind::Comment),
self_pos: word_pos,
opposite_pos: opposite_word_pos,
},
pos: word_pos,
});
offset += word.len();
opposite_offset += opposite_word.len();
}
myers_diff::DiffResult::Right(opposite_word) => {
// Only exists on other side, nothing to do on this side.
opposite_offset += opposite_word.len();
}
}
}
res
}
impl MatchedPos {
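/// Build the `MatchedPos` values for a single atom or delimiter position:
/// one per non-empty line covered by `pos`, or word-level positions when
/// `ck` is `ReplacedComment`.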
fn new(
ck: ChangeKind,
highlight: TokenKind,
pos: &[SingleLineSpan],
is_close: bool,
) -> Vec<Self> {
let kind = match ck {
ReplacedComment(this, opposite) => {
let this_content = match this {
List { .. } => unreachable!(),
Atom { content, .. } => content,
};
let (opposite_content, opposite_pos) = match opposite {
List { .. } => unreachable!(),
Atom {
content, position, ..
} => (content, position),
};
return split_comment_words(
this_content,
// TODO: handle the whole pos here.
pos[0],
opposite_content,
opposite_pos[0],
);
}
Unchanged(opposite) => {
let opposite_pos = match opposite {
List {
open_position,
close_position,
..
} => {
if is_close {
close_position.clone()
} else {
open_position.clone()
}
}
Atom { position, .. } => position.clone(),
};
MatchKind::UnchangedToken {
highlight,
self_pos: pos.to_vec(),
opposite_pos,
}
}
Novel => MatchKind::Novel { highlight },
};
// Create a MatchedPos for every line that `pos` covers.
// TODO: what about opposite pos?
let mut res = vec![];
for line_pos in pos {
// Don't create a MatchedPos for empty positions. This
// occurs when we have lists with empty open/close
// delimiter positions, such as the top-level list of syntax items.
if line_pos.start_col == line_pos.end_col {
continue;
}
res.push(Self {
kind: kind.clone(),
pos: *line_pos,
});
}
res
}
}
/// Walk `nodes` and return a vec of all the changed positions.
pub fn change_positions<'a>(nodes: &[&'a Syntax<'a>]) -> Vec<MatchedPos> {
let mut positions = Vec::new();
change_positions_(nodes, &mut positions);
positions
}
fn change_positions_<'a>(nodes: &[&'a Syntax<'a>], positions: &mut Vec<MatchedPos>) {
for node in nodes {
let change = node
.change()
.unwrap_or_else(|| panic!("Should have changes set in all nodes: {:#?}", node));
match node {
List {
open_position,
children,
close_position,
..
} => {
positions.extend(MatchedPos::new(
change,
TokenKind::Delimiter,
open_position,
false,
));
change_positions_(children, positions);
positions.extend(MatchedPos::new(
change,
TokenKind::Delimiter,
close_position,
true,
));
}
Atom { position, kind, .. } => {
positions.extend(MatchedPos::new(
change,
TokenKind::Atom(*kind),
position,
false,
));
}
}
}
}
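/// Zip `lhs` with `rhs`, padding the shorter slice with `None`.
///
/// Illustrative sketch (not a doctest):
///
/// ```ignore
/// assert_eq!(
///     zip_pad_shorter(&[1, 2], &["a"]),
///     vec![(Some(1), Some("a")), (Some(2), None)],
/// );
/// ```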
pub fn zip_pad_shorter<Tx: Clone, Ty: Clone>(
lhs: &[Tx],
rhs: &[Ty],
) -> Vec<(Option<Tx>, Option<Ty>)> {
let mut res = vec![];
let mut i = 0;
loop {
match (lhs.get(i), rhs.get(i)) {
(None, None) => break,
(x, y) => res.push((x.cloned(), y.cloned())),
}
i += 1;
}
res
}
/// Zip `lhs` with `rhs`, but repeat the last item from the shorter
/// slice.
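///
/// Illustrative sketch (not a doctest):
///
/// ```ignore
/// assert_eq!(
///     zip_repeat_shorter(&[1, 2, 3], &["a"]),
///     vec![(1, "a"), (2, "a"), (3, "a")],
/// );
/// ```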
pub fn zip_repeat_shorter<Tx: Clone, Ty: Clone>(lhs: &[Tx], rhs: &[Ty]) -> Vec<(Tx, Ty)> {
let lhs_last: Tx = match lhs.last() {
Some(last) => last.clone(),
None => return vec![],
};
let rhs_last: Ty = match rhs.last() {
Some(last) => last.clone(),
None => return vec![],
};
let mut res = vec![];
let mut i = 0;
loop {
match (lhs.get(i), rhs.get(i)) {
(None, None) => break,
(x, y) => res.push((
x.cloned().unwrap_or_else(|| lhs_last.clone()),
y.cloned().unwrap_or_else(|| rhs_last.clone()),
)),
}
i += 1;
}
res
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
/// Consider comment atoms as distinct from other atoms, even when the
/// content is otherwise identical.
#[test]
fn test_comment_and_atom_differ() {
let pos = vec![SingleLineSpan {
line: 0.into(),
start_col: 2,
end_col: 3,
}];
let arena = Arena::new();
let comment = Syntax::new_atom(&arena, pos.clone(), "foo", AtomKind::Comment);
let atom = Syntax::new_atom(&arena, pos, "foo", AtomKind::Normal);
init_all_info(&[comment], &[atom]);
assert_ne!(comment, atom);
}
/// Ignore the syntax highlighting kind when comparing
/// atoms. Sometimes changing delimiter wrapping can change
/// whether a parser thinks that a node is e.g. a type.
#[test]
fn test_atom_equality_ignores_highlighting() {
let pos = vec![SingleLineSpan {
line: 0.into(),
start_col: 2,
end_col: 3,
}];
let arena = Arena::new();
let type_atom = Syntax::new_atom(&arena, pos.clone(), "foo", AtomKind::Type);
let atom = Syntax::new_atom(&arena, pos, "foo", AtomKind::Normal);
init_all_info(&[type_atom], &[atom]);
assert_eq!(type_atom, atom);
}
#[test]
fn test_flatten_trivial_list() {
let pos = vec![SingleLineSpan {
line: 0.into(),
start_col: 2,
end_col: 3,
}];
let arena = Arena::new();
let atom = Syntax::new_atom(&arena, pos, "foo", AtomKind::Normal);
let trivial_list = Syntax::new_list(&arena, "", vec![], vec![atom], "", vec![]);
assert!(matches!(trivial_list, Atom { .. }));
}
#[test]
fn test_ignore_empty_atoms() {
let pos = vec![SingleLineSpan {
line: 0.into(),
start_col: 2,
end_col: 2,
}];
let arena = Arena::new();
let atom = Syntax::new_atom(&arena, pos, "", AtomKind::Normal);
let trivial_list = Syntax::new_list(&arena, "(", vec![], vec![atom], ")", vec![]);
match trivial_list {
List { children, .. } => {
assert_eq!(children.len(), 0);
}
Atom { .. } => unreachable!(),
}
}
#[test]
fn test_multiline_comment_ignores_leading_whitespace() {
let pos = vec![SingleLineSpan {
line: 0.into(),
start_col: 2,
end_col: 3,
}];
let arena = Arena::new();
let x = Syntax::new_atom(&arena, pos.clone(), "foo\nbar", AtomKind::Comment);
let y = Syntax::new_atom(&arena, pos, "foo\n bar", AtomKind::Comment);
init_all_info(&[x], &[y]);
assert_eq!(x, y);
}
#[test]
fn test_atom_equality_ignores_change() {
let lhs = Atom {
info: SyntaxInfo {
change: Cell::new(Some(Novel)),
..SyntaxInfo::default()
},
position: vec![SingleLineSpan {
line: 1.into(),
start_col: 2,
end_col: 3,
}],
content: "foo".into(),
kind: AtomKind::Normal,
};
let rhs = Atom {
info: SyntaxInfo {
change: Cell::new(None),
..SyntaxInfo::default()
},
position: vec![SingleLineSpan {
line: 1.into(),
start_col: 2,
end_col: 3,
}],
content: "foo".into(),
kind: AtomKind::Normal,
};
init_all_info(&[&lhs], &[&rhs]);
assert_eq!(lhs, rhs);
}
#[test]
fn test_split_comment_words_basic() {
let content = "abc";
let pos = SingleLineSpan {
line: 0.into(),
start_col: 0,
end_col: 3,
};
let opposite_content = "def";
let opposite_pos = SingleLineSpan {
line: 0.into(),
start_col: 0,
end_col: 3,
};
let res = split_comment_words(content, pos, opposite_content, opposite_pos);
assert_eq!(
res,
vec![MatchedPos {
kind: MatchKind::NovelWord {
highlight: TokenKind::Atom(AtomKind::Comment),
},
pos: SingleLineSpan {
line: 0.into(),
start_col: 0,
end_col: 3
},
},]
);
}
#[test]
fn test_split_words() {
let s = "example.com";
let res = split_words(s);
assert_eq!(res, vec!["example", ".", "com"])
}
#[test]
fn test_split_words_punctuations() {
let s = "example..";
let res = split_words(s);
assert_eq!(res, vec!["example", ".", "."])
}
#[test]
fn test_split_words_treats_newline_separately() {
let s = "example.\ncom";
let res = split_words(s);
assert_eq!(res, vec!["example", ".", "\n", "com"])
}
#[test]
fn test_split_words_single_unicode() {
let s = "a ö b";
let res = split_words(s);
assert_eq!(res, vec!["a", " ", "ö", " ", "b"])
}
#[test]
fn test_split_words_single_unicode_not_alphabetic() {
let s = "a 💝 b";
let res = split_words(s);
assert_eq!(res, vec!["a", " ", "💝", " ", "b"])
}
#[test]
fn test_split_words_unicode() {
let s = "a xöy b";
let res = split_words(s);
assert_eq!(res, vec!["a", " ", "xöy", " ", "b"])
}
}