/*!
* A dictionary search tool.
*
* Copyright (C) 2023-2024 kaoru
*/
use std::cmp::min;
use std::env;
use std::fs::File;
use std::io::{stdin, Read};
use std::path::Path;
use std::process::exit;
use anyhow::Result;
use tetengo_trie::{MemoryStorage, Trie, ValueDeserializer};
fn main() {
if let Err(e) = main_core() {
eprintln!("Error: {}", e);
exit(1);
}
}
fn main_core() -> Result<()> {
let args = env::args().collect::>();
if args.len() <= 2 {
eprintln!("Usage: search_dict UniDic_lex.csv trie.bin");
return Ok(());
}
let lex_csv = load_lex_csv(Path::new(&args[1]))?;
let trie = load_trie(Path::new(&args[2]))?;
loop {
eprint!(">> ");
let mut line = String::new();
let read_length = stdin().read_line(&mut line)?;
if read_length == 0 {
break;
}
if line.is_empty() {
continue;
}
line = line.trim_end().to_string();
let found = match trie.find(&line)? {
Some(found) => found,
None => {
println!("ERROR: Not found.");
continue;
}
};
found.iter().for_each(|e| {
let (offset, length) = *e;
print!("{}", substring_view(&lex_csv, offset, length));
});
}
Ok(())
}
#[derive(Debug, thiserror::Error)]
enum DictSearchingError {
#[error("Can't read the whole of lex.csv file.")]
CantReadWholeOfLexCsvFile,
}
fn load_lex_csv(lex_csv_path: &Path) -> Result {
let mut file = File::open(lex_csv_path)?;
let lex_csv_size = file.metadata()?.len();
let mut buffer = String::new();
let read_length = file.read_to_string(&mut buffer)?;
if read_length != lex_csv_size as usize {
return Err(DictSearchingError::CantReadWholeOfLexCsvFile.into());
}
Ok(buffer)
}
type DictTrie = Trie>;
fn load_trie(trie_path: &Path) -> Result {
let mut file = File::open(trie_path)?;
let mut value_deserializer = ValueDeserializer::new(Box::new(deserialize_value));
let storage = Box::new(MemoryStorage::new_with_reader(
&mut file,
&mut value_deserializer,
)?);
let trie = DictTrie::builder_with_storage(storage).build();
Ok(trie)
}
const VALUE_CAPACITY: usize = 4usize;
fn deserialize_value(bytes: &[u8]) -> Result> {
let mut byte_offset = 0usize;
let size = deserialize_usize(bytes, &mut byte_offset)?;
let mut vps = Vec::with_capacity(size);
for _ in 0..min(size, VALUE_CAPACITY) {
vps.push(deserialize_pair_of_usize(bytes, &mut byte_offset)?);
}
(VALUE_CAPACITY..size).for_each(|_| {
vps.push((0, 0));
});
Ok(vps)
}
fn deserialize_pair_of_usize(bytes: &[u8], byte_offset: &mut usize) -> Result<(usize, usize)> {
let first = deserialize_usize(bytes, byte_offset)?;
let second = deserialize_usize(bytes, byte_offset)?;
Ok((first, second))
}
fn deserialize_usize(bytes: &[u8], byte_offset: &mut usize) -> Result {
let mut value = 0usize;
(0..size_of::()).for_each(|i| {
value <<= 8;
value |= bytes[*byte_offset + i] as usize;
});
*byte_offset += size_of::();
Ok(value)
}
fn substring_view(sv: &str, offset: usize, length: usize) -> &str {
if offset == 0 && length == 0 {
return "(truncated)\n";
}
&sv[offset..offset + length]
}