//! The chunker is responsible for dividing objects into chunks for
//! deduplication and storage.
//!
//! The chunker uses a rolling hash (currently a derivative of buzhash) to
//! perform content based slicing. Any single byte change in the object should,
//! ideally, only result in having to re-write the chunk it is in, and maybe the
//! one after it. It should not affect deduplication for the rest of the file.
//! In practice, this can sometimes get confounded by the minimum and maximum
//! chunk sizes.
//!
//! The file is sliced by rolling each byte through the hasher, and generating
//! splits when the last mask_bits of the resulting hash are all zero.
//!
//! This results in a statistically expected slice length of 2^mask_bits bytes.
//! Minimum and maximum chunk sizes are set at 2^(mask_bits-2) bytes and
//! 2^(mask_bits+2) bytes, respectively, to prevent overly large or small
//! slices, and to provide some measure of predictability.
use rand::prelude::*;
use std::collections::VecDeque;
use std::io::Read;

#[cfg(feature = "profile")]
use flamer::*;

/// Stores the data in a slice/chunk, as well as providing information about
/// the location of the slice in the file.
pub struct Slice {
    pub data: Vec<u8>,
    pub start: u64,
    pub end: u64,
}

/// Applies content based slicing over a `Read`, and iterates through the slices
///
/// Slicing is applied as the iteration proceeds, so each byte is only read once
pub struct IteratedReader<R: Read> {
    /// Internal Reader
    reader: R,
    /// Hasher
    hasher: BuzHash,
    /// Hash Mask
    mask: u64,
    /// Minimum chunk size
    min_size: usize,
    /// Maximum chunk size
    max_size: usize,
    /// Read buffer
    buffer: [u8; 8192],
    /// Location of the cursor in the buffer
    cursor: usize,
    /// Length of the data in the buffer
    buffer_len: usize,
    /// Location of the cursor in the file
    location: usize,
    /// Flag for if end of file has been reached
    finished: bool,
}

impl<R: Read> Iterator for IteratedReader<R> {
    type Item = Slice;

    #[cfg_attr(feature = "profile", flame)]
    fn next(&mut self) -> Option<Slice> {
        if self.finished {
            None
        } else {
            let start = self.location;
            let mut end = self.location;
            let mut output = Vec::<u8>::new();
            let hasher = &mut self.hasher;
            hasher.reset();
            let mut split = false;
            while !split {
                if self.cursor < self.buffer_len {
                    let byte = self.buffer[self.cursor];
                    output.push(byte);
                    let hash = hasher.hash_byte(byte);
                    let len = output.len();
                    if (hash & self.mask) == 0
                        && (len >= self.min_size)
                        && (len <= self.max_size)
                    {
                        split = true;
                        end = self.location;
                    }
                    self.location += 1;
                    self.cursor += 1;
                } else {
                    self.cursor = 0;
                    let result = self.reader.read(&mut self.buffer);
                    match result {
                        Err(_) | Ok(0) => {
                            // Treat read errors and EOF the same: emit what we
                            // have and stop iterating.
                            split = true;
                            end = self.location;
                            self.finished = true;
                        }
                        Ok(n) => {
                            self.buffer_len = n;
                        }
                    }
                }
            }
            Some(Slice {
                data: output,
                start: start as u64,
                end: end as u64,
            })
        }
    }
}

/// Stores chunker settings for easy reuse
#[derive(Clone)]
pub struct Chunker {
    /// Hash Mask
    mask: u64,
    /// Hasher
    hasher: BuzHash,
    /// Mask bits count
    mask_bits: u32,
}

impl Chunker {
    /// Creates a new chunker with the given window size, mask bits, and nonce
    pub fn new(window: u64, mask_bits: u32, nonce: u64) -> Chunker {
        Chunker {
            mask: 2_u64.pow(mask_bits) - 1,
            hasher: BuzHash::new(nonce, window as u32),
            mask_bits,
        }
    }

    /// Produces an iterator over the slices in a file
    ///
    /// Will make a copy of the internal hasher
    pub fn chunked_iterator<R: Read>(&self, reader: R) -> IteratedReader<R> {
        IteratedReader {
            reader,
            hasher: self.hasher.clone(),
            mask: self.mask,
            min_size: 2_usize.pow(self.mask_bits - 2),
            max_size: 2_usize.pow(self.mask_bits + 2),
            buffer: [0_u8; 8192],
            cursor: 0,
            buffer_len: 0,
            location: 0,
            finished: false,
        }
    }
}
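// A minimal usage sketch, not part of the original module: it feeds an
// in-memory buffer through `Chunker::chunked_iterator` and checks that the
// emitted slices cover the input exactly once. The window size (48), mask
// bits (12, roughly 4 KiB expected slices), and nonce (0) are arbitrary
// example values chosen here, not defaults taken from the source.
#[cfg(test)]
mod chunker_usage_sketch {
    use super::*;

    #[test]
    fn slices_cover_the_input() {
        // Patterned data so the rolling hash sees varied content.
        let data: Vec<u8> = (0..64 * 1024_usize).map(|i| (i % 251) as u8).collect();

        let chunker = Chunker::new(48, 12, 0);
        let slices: Vec<Slice> = chunker.chunked_iterator(&data[..]).collect();

        // Every byte of the input should land in exactly one slice.
        let total: usize = slices.iter().map(|s| s.data.len()).sum();
        assert_eq!(total, data.len());
        assert!(!slices.is_empty());
    }
}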
#[derive(Clone)]
pub struct BuzHash {
    hash: u64,
    table: [u64; 256],
    window_size: u32,
    buffer: VecDeque<u8>,
    count: u32,
}

impl BuzHash {
    pub fn new(nonce: u64, window_size: u32) -> BuzHash {
        // Fill the substitution table with random values derived from the
        // nonce, so different nonces produce different, but reproducible,
        // chunkings.
        let mut table = [0_u64; 256];
        let mut rand = SmallRng::seed_from_u64(nonce);
        for i in table.iter_mut() {
            *i = rand.gen();
        }
        BuzHash {
            hash: 0,
            table,
            window_size,
            buffer: VecDeque::with_capacity(window_size as usize),
            count: 0,
        }
    }

    pub fn hash_byte(&mut self, byte: u8) -> u64 {
        // Determine if removal is needed
        if self.count >= self.window_size {
            // Window is full: rotate the hash, remove the contribution of the
            // byte leaving the window, and mix in the incoming byte.
            let hash = self.hash.rotate_left(1);
            let head = self.buffer.pop_front().unwrap();
            let head = self.table[head as usize].rotate_left(self.window_size);
            let tail = self.table[byte as usize];
            self.hash = hash ^ head ^ tail;
        } else {
            // Window is still filling: just rotate and mix in the incoming byte.
            self.count += 1;
            let hash = self.hash.rotate_left(1);
            let tail = self.table[byte as usize];
            self.hash = hash ^ tail;
        }

        self.buffer.push_back(byte);

        self.hash
    }

    pub fn reset(&mut self) {
        self.hash = 0;
        self.count = 0;
        self.buffer.clear();
    }
}
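// A small verification sketch, not part of the original module: it exercises
// the rolling property of `BuzHash`. Once `window_size` bytes have been
// hashed, the hash should depend only on the last `window_size` bytes, so two
// streams with different prefixes but the same final window agree. The window
// size (16), nonce (7), and prefix bytes are arbitrary example values.
#[cfg(test)]
mod buzhash_rolling_sketch {
    use super::*;

    #[test]
    fn hash_depends_only_on_the_last_window() {
        let window: u32 = 16;
        let tail: Vec<u8> = (0u8..16).collect();

        let mut a = BuzHash::new(7, window);
        let last_a = [1_u8, 2, 3]
            .iter()
            .chain(tail.iter())
            .map(|byte| a.hash_byte(*byte))
            .last()
            .unwrap();

        let mut b = BuzHash::new(7, window);
        let last_b = [9_u8, 8, 7, 6, 5]
            .iter()
            .chain(tail.iter())
            .map(|byte| b.hash_byte(*byte))
            .last()
            .unwrap();

        // Same nonce and same final window of bytes, so the hashes must match
        // even though the prefixes differ.
        assert_eq!(last_a, last_b);
    }
}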