use std::fs; use fake::{Fake, Faker}; use itertools::Itertools; use more_asserts::assert_le; #[cfg(feature = "markdown")] use text_splitter::{ChunkConfig, MarkdownSplitter}; #[cfg(feature = "markdown")] #[test] fn random_chunk_size() { let text = fs::read_to_string("tests/inputs/markdown/github_flavored.md").unwrap(); for _ in 0..10 { let max_characters = Faker.fake(); let splitter = MarkdownSplitter::new(ChunkConfig::new(max_characters).with_trim(false)); let chunks = splitter.chunks(&text).collect::>(); assert_eq!(chunks.join(""), text); for chunk in chunks { assert_le!(chunk.chars().count(), max_characters); } } } #[cfg(feature = "markdown")] #[test] fn random_chunk_indices_increase() { let text = fs::read_to_string("tests/inputs/markdown/github_flavored.md").unwrap(); for _ in 0..10 { let max_characters = Faker.fake::(); let splitter = MarkdownSplitter::new(ChunkConfig::new(max_characters).with_trim(false)); let indices = splitter.chunk_indices(&text).map(|(i, _)| i); assert!(indices.tuple_windows().all(|(a, b)| a < b)); } } #[cfg(feature = "markdown")] #[test] fn fallsback_to_normal_text_split_if_no_markdown_content() { let chunk_config = ChunkConfig::new(10).with_trim(false); let splitter = MarkdownSplitter::new(chunk_config); let text = "Some text\n\nfrom a\ndocument"; let chunks = splitter.chunks(text).collect::>(); assert_eq!(["Some text\n", "\nfrom a\n", "document"].to_vec(), chunks); } #[cfg(feature = "markdown")] #[test] fn split_by_rule() { let splitter = MarkdownSplitter::new(ChunkConfig::new(12).with_trim(false)); let text = "Some text\n\n---\n\nwith a rule"; let chunks = splitter.chunks(text).collect::>(); assert_eq!(["Some text\n\n", "---\n", "\nwith a rule"].to_vec(), chunks); } #[cfg(feature = "markdown")] #[test] fn split_by_rule_trim() { let splitter = MarkdownSplitter::new(12); let text = "Some text\n\n---\n\nwith a rule"; let chunks = splitter.chunks(text).collect::>(); assert_eq!(["Some text", "---", "with a rule"].to_vec(), chunks); } #[cfg(feature = "markdown")] #[test] fn split_by_headers() { let splitter = MarkdownSplitter::new(ChunkConfig::new(30).with_trim(false)); let text = "# Header 1\n\nSome text\n\n## Header 2\n\nwith headings\n"; let chunks = splitter.chunks(text).collect::>(); assert_eq!( [ "# Header 1\n\nSome text\n\n", "## Header 2\n\nwith headings\n" ] .to_vec(), chunks ); } #[cfg(feature = "markdown")] #[test] fn subheadings_grouped_with_top_header() { let splitter = MarkdownSplitter::new(ChunkConfig::new(60).with_trim(false)); let text = "# Header 1\n\nSome text\n\n## Header 2\n\nwith headings\n\n### Subheading\n\nand more text\n"; let chunks = splitter.chunks(text).collect::>(); assert_eq!( [ "# Header 1\n\nSome text\n\n", "## Header 2\n\nwith headings\n\n### Subheading\n\nand more text\n" ] .to_vec(), chunks ); } #[cfg(feature = "markdown")] #[test] fn trimming_doesnt_trim_block_level_indentation_if_multiple_items() { let splitter = MarkdownSplitter::new(48); let text = "* Really long list item that is too big to fit\n\n * Some Indented Text\n\n * More Indented Text\n\n"; let chunks = splitter.chunks(text).collect::>(); assert_eq!( [ "* Really long list item that is too big to fit", " * Some Indented Text\n\n * More Indented Text" ] .to_vec(), chunks ); } #[cfg(feature = "markdown")] #[test] fn trimming_does_trim_block_level_indentation_if_only_one_item() { let splitter = MarkdownSplitter::new(30); let text = "1. Really long list item\n\n 1. Some Indented Text\n\n 2. More Indented Text\n\n"; let chunks = splitter.chunks(text).collect::>(); assert_eq!( [ "1. Really long list item", "1. Some Indented Text", "2. More Indented Text" ] .to_vec(), chunks ); }