use BytesBuf;
use StrBuf;
use std::error;
use std::fmt;
use std::io;
use std::mem;
use utf8::{self, Incomplete, DecodeError};

/// A “zero-copy” incremental lossy UTF-8 decoder.
///
/// * **“Zero-copy”**:
///   String buffers produced by the decoder are either inline
///   or share a heap allocation with an input bytes buffer.
///   The decoder never allocates memory.
///
/// * **Incremental**:
///   The input doesn’t need to be provided all at once in a contiguous buffer.
///   Whatever input is available can be decoded while waiting for more to arrive,
///   for example from the network.
///   The decoder takes care of reconstructing `char` code points correctly
///   if their UTF-8 bytes span multiple input chunks.
///
///   If the entire input *is* available all at once, consider using
///   [`StrBuf::from_utf8_lossy`](struct.StrBuf.html#method.from_utf8_lossy) instead.
///
/// * **Lossy**:
///   Invalid byte sequences (decoding errors) are replaced with
///   the replacement character U+FFFD.
///
/// # Examples
///
/// This is the [`StrBuf::from_utf8_iter_lossy`](struct.StrBuf.html#method.from_utf8_iter_lossy)
/// method:
///
/// ```
/// # use zbuf::{BytesBuf, StrBuf, LossyUtf8Decoder};
/// pub fn from_utf8_iter_lossy<I>(iter: I) -> StrBuf
/// where I: IntoIterator, I::Item: Into<BytesBuf> {
///     let mut decoder = LossyUtf8Decoder::new();
///     let mut buf = StrBuf::new();
///     for item in iter {
///         buf.extend(decoder.feed(item.into()))
///     }
///     buf.extend(decoder.end());
///     buf
/// }
/// ```
pub struct LossyUtf8Decoder(StrictUtf8Decoder);

impl LossyUtf8Decoder {
    /// Return a new decoder
    pub fn new() -> Self {
        LossyUtf8Decoder(StrictUtf8Decoder::new())
    }

    /// Provide more bytes input to decode. Returns an iterator of `StrBuf`.
    ///
    /// The returned iterator must be exhausted (consumed until `.next()` returns `None`)
    /// before the next call to `.feed(…)` or `.end()`.
    ///
    /// # Panics
    ///
    /// Panics if the input of a previous `.feed(…)` call was not consumed entirely.
    pub fn feed(&mut self, next_input_chunk: BytesBuf) -> &mut Self {
        // The wrapped strict decoder stores the chunk (and performs the panic check);
        // `self` is returned so that the lossy `Iterator` impl is the one being driven.
        self.0.feed(next_input_chunk);
        self
    }

    /// Signal the end of the input. This may return one replacement character U+FFFD.
    ///
    /// Failing to call this method may result in incorrect decoding.
    ///
    /// Note that `Option<T>` implements `IntoIterator`,
    /// so it can be given for example to an `extend` method.
    ///
    /// # Panics
    ///
    /// Panics if the input of a previous `.feed(…)` call was not consumed entirely.
    pub fn end(&mut self) -> Option<StrBuf> {
        // An error from the strict decoder at end of input becomes one U+FFFD.
        self.0.end().err().map(replacement_character)
    }
}

// FIXME: Make this a `const` item when const_fn is stable
/// Map any decoding error to a buffer holding `utf8::REPLACEMENT_CHARACTER`.
#[inline]
fn replacement_character(_: Utf8DecoderError) -> StrBuf {
    StrBuf::from(utf8::REPLACEMENT_CHARACTER)
}

impl Iterator for LossyUtf8Decoder {
    type Item = StrBuf;

    /// Yield the next decoded buffer, substituting the replacement character
    /// for every error reported by the wrapped strict decoder.
    fn next(&mut self) -> Option<StrBuf> {
        self.0.next().map(|result| result.unwrap_or_else(replacement_character))
    }
}

/// A “zero-copy” incremental strict UTF-8 decoder.
///
/// * **“Zero-copy”**:
///   String buffers produced by the decoder are either inline
///   or share a heap allocation with an input bytes buffer.
///   The decoder never allocates memory.
///
/// * **Incremental**:
///   The input doesn’t need to be provided all at once in a contiguous buffer.
///   Whatever input is available can be decoded while waiting for more to arrive,
///   for example from the network.
///   The decoder takes care of reconstructing `char` code points correctly
///   if their UTF-8 bytes span multiple input chunks.
///
///   If the entire input *is* available all at once, consider using
///   [`StrBuf::from_utf8_lossy`](struct.StrBuf.html#method.from_utf8_lossy) instead.
/// /// * **Strict**: /// Invalid byte sequences are represented as `Result::Err` /// /// # Examples /// /// This is the [`StrBuf::from_utf8_iter`](struct.StrBuf.html#method.from_utf8_iter) method: /// /// ``` /// # use zbuf::{BytesBuf, StrBuf, StrictUtf8Decoder, Utf8DecoderError}; /// pub fn from_utf8_iter(iter: I) -> Result /// where I: IntoIterator, I::Item: Into { /// let mut decoder = StrictUtf8Decoder::new(); /// let mut buf = StrBuf::new(); /// for item in iter { /// for result in decoder.feed(item.into()) { /// buf.push_buf(&result?) /// } /// } /// decoder.end()?; /// Ok(buf) /// } /// ``` pub struct StrictUtf8Decoder { input_chunk: BytesBuf, incomplete_char: Incomplete, yield_error_next: bool, sum_chunks_len_so_far: usize, } impl StrictUtf8Decoder { /// Return a new decoder pub fn new() -> Self { StrictUtf8Decoder { incomplete_char: Incomplete::empty(), input_chunk: BytesBuf::new(), yield_error_next: false, sum_chunks_len_so_far: 0, } } fn exhausted(&self) -> bool { self.input_chunk.is_empty() && !self.yield_error_next } /// Provide more bytes input to decode. Returns an iterator of `Result`. /// /// The returned iterator must be exhausted (consumed until `.next()` returns `None`) /// before the next call to `.feed(…)` or `.end()`. /// /// # Panics /// /// Panics if the input of a previous `.feed(…)` call was not consumed entirely. pub fn feed(&mut self, next_input_chunk: BytesBuf) -> &mut Self { assert!(self.exhausted(), "feeding Utf8Decoder before exhausting the previous input chunk"); self.sum_chunks_len_so_far += next_input_chunk.len(); self.input_chunk = next_input_chunk; self } /// Signal the end of the input. This may return an error. /// /// Failing to call this method may result in incorrect decoding. /// /// # Panics /// /// Panics if the input of a previous `.feed(…)` call was not consumed entirely. 
pub fn end(&mut self) -> Result<(), Utf8DecoderError> { assert!(self.exhausted(), "ending Utf8Decoder before exhausting the previous input chunk"); if self.incomplete_char.is_empty() { Ok(()) } else { self.incomplete_char = Incomplete::empty(); Err(Utf8DecoderError { position: self.sum_chunks_len_so_far, }) } } fn take_input(&mut self) -> BytesBuf { mem::replace(&mut self.input_chunk, BytesBuf::new()) } fn error(&self) -> Utf8DecoderError { Utf8DecoderError { position: self.sum_chunks_len_so_far - self.input_chunk.len(), } } #[cold] fn try_complete(&mut self) -> Option> { // FIXME: simplify when borrows are non-lexical let unborrowed = { let input_chunk = &self.input_chunk; self.incomplete_char.try_complete(input_chunk) .map(|(result, remaining_input)| { let consumed = input_chunk.len() - remaining_input.len(); // `result` here is up to 4 bytes and therefore fits in an inline buffer, // so it is better to not try to share a heap allocation with `input_chunk`. let result = result.map(StrBuf::from).map_err(|_| ()); (consumed, result) }) }; match unborrowed { None => { // Consumed the entire input self.input_chunk = BytesBuf::new(); None } Some((consumed_prefix_len, result)) => { self.input_chunk.pop_front(consumed_prefix_len); Some(result.map_err(|()| self.error())) } } } } impl Iterator for StrictUtf8Decoder { type Item = Result; fn next(&mut self) -> Option> { if self.yield_error_next { self.yield_error_next = false; return Some(Err(self.error())) } if self.input_chunk.is_empty() { return None } if !self.incomplete_char.is_empty() { return self.try_complete() } struct IsIncomplete; // FIXME: simplify when borrows are non-lexical let unborrowed = match utf8::decode(&self.input_chunk) { Ok(_) => Ok(()), Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => { self.incomplete_char = incomplete_suffix; Err((valid_prefix.len(), Ok(IsIncomplete))) } Err(DecodeError::Invalid { valid_prefix, invalid_sequence, remaining_input }) => { let resume_at = if 
remaining_input.is_empty() { None } else { Some(valid_prefix.len() + invalid_sequence.len()) }; Err((valid_prefix.len(), Err(resume_at))) } }; let mut bytes; match unborrowed { Ok(()) => { bytes = self.take_input() } Err((0, Ok(IsIncomplete))) => { self.input_chunk = BytesBuf::new(); return None } Err((valid_prefix_len, Ok(IsIncomplete))) => { bytes = self.take_input(); bytes.truncate(valid_prefix_len) } Err((0, Err(None))) => { self.input_chunk = BytesBuf::new(); return Some(Err(self.error())) } Err((0, Err(Some(resume_at)))) => { self.input_chunk.pop_front(resume_at); return Some(Err(self.error())) } Err((valid_prefix_len, Err(None))) => { self.yield_error_next = true; bytes = self.take_input(); bytes.truncate(valid_prefix_len); } Err((valid_prefix_len, Err(Some(resume_at)))) => { self.yield_error_next = true; bytes = self.input_chunk.clone(); bytes.truncate(valid_prefix_len); self.input_chunk.pop_front(resume_at); } } unsafe { Some(Ok(StrBuf::from_utf8_unchecked(bytes))) } } } /// The error type for [`StrictUtf8Decoder`](struct.StrictUtf8Decoder.html). #[derive(Debug, Copy, Clone)] pub struct Utf8DecoderError { position: usize, } impl Utf8DecoderError { /// Total number of bytes from the start of the stream to this invalid byte sequence. pub fn position(&self) -> usize { self.position } } impl fmt::Display for Utf8DecoderError { fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!(formatter, "invalid UTF-8 byte sequence at byte {}", self.position) } } impl error::Error for Utf8DecoderError { fn description(&self) -> &str { "invalid utf-8" } } impl From for io::Error { fn from(error: Utf8DecoderError) -> Self { Self::new(io::ErrorKind::InvalidData, error) } }