/*! * A dictionary building tool. * * Copyright (C) 2023-2024 kaoru */ use std::collections::HashMap; use std::env; use std::fs::File; use std::io::{BufRead, BufReader, BufWriter}; use std::path::Path; use std::process::exit; use anyhow::Result; use tetengo_trie::{BuldingObserverSet, Serializer, StringSerializer, Trie, ValueSerializer}; fn main() { if let Err(e) = main_core() { eprintln!("Error: {}", e); exit(1); } } fn main_core() -> Result<()> { let args = env::args().collect::>(); if args.len() <= 2 { eprintln!("Usage: make_dict UniDic_lex.csv trie.bin"); return Ok(()); } let word_offset_map = load_lex_csv(Path::new(&args[1]))?; let trie = build_trie(word_offset_map)?; serialize_trie(&trie, Path::new(&args[2]))?; Ok(()) } #[derive(Debug, thiserror::Error)] enum DictMakingError { #[error("Invalid UniDic lex.csv format.")] InvalidUnidicLexCsvFormat, } type WordOffsetMap = HashMap>; fn load_lex_csv(lex_csv_path: &Path) -> Result { let file = File::open(lex_csv_path)?; let mut word_offset_map = WordOffsetMap::new(); eprintln!("Loading UniDic lex.csv..."); let mut line_head = 0usize; let buf_reader = BufReader::new(file); for (i, line) in buf_reader.lines().enumerate() { let Ok(line) = line else { eprintln!("{:8}: Can't read this line.", i); return Err(DictMakingError::InvalidUnidicLexCsvFormat.into()); }; if line.is_empty() { line_head += line.len() + 1; continue; } let elements = split(&line, ','); if elements.len() != 33 { eprintln!("{:8}: {}", i, elements[0]); return Err(DictMakingError::InvalidUnidicLexCsvFormat.into()); } if elements[16] == "記号" && elements[23] == "補助" { insert_word_offset_to_map(elements[0], line_head, line.len() + 1, &mut word_offset_map); } else { insert_word_offset_to_map( elements[12], line_head, line.len() + 1, &mut word_offset_map, ); insert_word_offset_to_map( elements[24], line_head, line.len() + 1, &mut word_offset_map, ); } if i % 10000 == 0 { eprint!("{:8}: {} \r", i, elements[0]); } line_head += line.len() + 1; } eprintln!("Done. "); Ok(word_offset_map) } fn split(string: &str, delimiter: char) -> Vec<&str> { let mut elements = Vec::new(); let mut first = 0usize; loop { if first < string.len() && string[first..].starts_with('"') { if let Some(length) = string[first + 1..].find('"') { let last = first + 1 + length; elements.push(&string[first + 1..last]); debug_assert!(string[last + 1..].starts_with(delimiter)); first = last + 2; } else { elements.push(&string[first + 1..]); break; } } else if let Some(length) = string[first..].find(delimiter) { let last = first + length; elements.push(&string[first..last]); first = last + 1; } else { elements.push(&string[first..]); break; } } elements } const VALUE_CAPACITY: usize = 4usize; fn insert_word_offset_to_map(key: &str, offset: usize, length: usize, map: &mut WordOffsetMap) { let value = map.entry(key.to_string()).or_default(); if value.iter().any(|&(o, l)| o == offset && l == length) { return; } if value.len() < VALUE_CAPACITY { value.push((offset, length)); } else { value.push((0, 0)); } } type DictTrie = Trie>; fn build_trie(word_offset_map: WordOffsetMap) -> Result { eprintln!("Building trie..."); let mut word_offset_vector = word_offset_map.into_iter().collect::>(); word_offset_vector.sort(); let mut index = 0usize; let trie = DictTrie::builder() .elements(word_offset_vector) .key_serializer(StringSerializer::new(true)) .build_with_observer_set(&mut BuldingObserverSet::new( &mut |key| { if index % 10000 == 0 { eprint!("{:8}: {} \r", index, String::from_utf8_lossy(key)); } index += 1; }, &mut || {}, )); eprintln!("Done. "); trie } const SERIALIZED_VALUE_SIZE: usize = size_of::() * (1 + 4 * 2); fn serialize_trie(trie: &DictTrie, trie_bin_path: &Path) -> Result<()> { eprintln!("Serializing trie..."); let file = File::create(trie_bin_path)?; let mut buf_writer = BufWriter::new(file); let mut serializer = ValueSerializer::new(Box::new(serialize_value), SERIALIZED_VALUE_SIZE); trie.storage().serialize(&mut buf_writer, &mut serializer)?; eprintln!("Done. "); Ok(()) } #[allow(clippy::ptr_arg)] fn serialize_value(vpus: &Vec<(usize, usize)>) -> Vec { let mut serialized = Vec::with_capacity(SERIALIZED_VALUE_SIZE); let serialized_size = serialize_usize(vpus.len()); serialized.extend(serialized_size); (0..VALUE_CAPACITY).for_each(|i| { if i < vpus.len() { let serialized_element = serialize_pair_of_usize(&vpus[i]); serialized.extend(serialized_element); } else { let serialized_element = serialize_pair_of_usize(&(0, 0)); serialized.extend(serialized_element); } }); serialized } fn serialize_pair_of_usize(pus: &(usize, usize)) -> Vec { let mut serialized = Vec::with_capacity(size_of::() * 2); let (offset, length) = pus; serialized.extend(serialize_usize(*offset)); serialized.extend(serialize_usize(*length)); serialized } fn serialize_usize(us: usize) -> Vec { debug_assert!(us <= u32::MAX as usize); let mut serialized = Vec::from([0u8; size_of::()]); (0..size_of::()).for_each(|i| { serialized[i] = ((us >> ((size_of::() - i - 1) * 8)) & 0xFF) as u8; }); serialized }