// Copyright 2014 Strahinja Val Markovic
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::char;

// More details: http://en.wikipedia.org/wiki/UTF-8#Description

/// Marker bits (`110xxxxx`) of a leading byte that is followed by exactly one
/// continuation byte. Compare against `byte & 0b11100000`.
pub static UTF8_1BYTE_FOLLOWING: u8 = 0b11000000;

/// Marker bits (`1110xxxx`) of a leading byte that is followed by exactly two
/// continuation bytes. Compare against `byte & 0b11110000`.
pub static UTF8_2BYTE_FOLLOWING: u8 = 0b11100000;

/// Marker bits (`11110xxx`) of a leading byte that is followed by exactly
/// three continuation bytes. Compare against `byte & 0b11111000`.
pub static UTF8_3BYTE_FOLLOWING: u8 = 0b11110000;


/// Decodes the single UTF-8 encoded code point at the start of `input`.
///
/// Only the first sequence is examined; any bytes after it are ignored.
///
/// Returns `None` when:
///
/// - `input` is empty;
/// - the first byte is not a valid leading byte (see `bytesFollowing`);
/// - `input` ends before the sequence is complete, or one of the expected
///   continuation bytes does not have the `10xxxxxx` form;
/// - the sequence is overlong, i.e. it spends more bytes than the shortest
///   encoding of the same value (for example `[0xC0, 0x80]` for U+0000);
/// - the decoded value is not a Unicode scalar value (a surrogate, or
///   anything above U+10FFFF).
pub fn readCodepoint( input: &[u8] ) -> Option< char > {
  fn isContinuationByte( byte: u8 ) -> bool {
    byte & 0b11000000 == 0b10000000
  }

  // Payload bits of a leading byte that announces `num_following`
  // continuation bytes; everything above the payload is the length marker.
  fn codepointBitsFromLeadingByte( byte: u8, num_following: usize ) -> u32 {
    let payload_mask: u8 = match num_following {
      0 => 0b01111111,
      1 => 0b00011111,
      2 => 0b00001111,
      _ => 0b00000111
    };
    ( byte & payload_mask ) as u32
  }

  fn codepointBitsFromContinuationByte( byte: u8 ) -> u32 {
    ( byte & 0b00111111 ) as u32
  }

  // Smallest code point that genuinely needs `num_following` continuation
  // bytes. A decoded value below this bound would have fit in a shorter
  // sequence, which makes the encoding overlong and therefore invalid.
  fn smallestCodepointFor( num_following: usize ) -> u32 {
    match num_following {
      0 => 0x0,
      1 => 0x80,
      2 => 0x800,
      _ => 0x10000
    }
  }

  let first_byte = match input.first() {
    Some( byte ) => *byte,
    None => return None
  };

  let num_following = match bytesFollowing( first_byte ) {
    Some( count ) => count,
    None => return None
  };

  // Accumulate most-significant bits first: every continuation byte pushes
  // the bits gathered so far six places up and fills in the low six.
  let mut codepoint = codepointBitsFromLeadingByte( first_byte, num_following );
  for i in 1 .. num_following + 1 {
    match input.get( i ) {
      Some( byte ) if isContinuationByte( *byte ) => {
        codepoint =
          ( codepoint << 6 ) | codepointBitsFromContinuationByte( *byte );
      }
      _ => return None
    }
  }

  if codepoint < smallestCodepointFor( num_following ) {
    return None;
  }

  // from_u32 takes care of surrogates and of values above U+10FFFF.
  char::from_u32( codepoint )
}


/// Classifies `byte` as the first byte of a UTF-8 sequence and reports how
/// many continuation bytes must follow it (0 to 3).
///
/// Returns `None` when `byte` cannot start a sequence: a continuation byte
/// (`10xxxxxx`) or a byte with the five top bits set (`11111xxx`).
pub fn bytesFollowing( byte: u8 ) -> Option< usize > {
  if isAscii( byte ) {
    Some( 0 )
  } else if byte & 0b11100000 == UTF8_1BYTE_FOLLOWING {
    Some( 1 )
  } else if byte & 0b11110000 == UTF8_2BYTE_FOLLOWING {
    Some( 2 )
  } else if byte & 0b11111000 == UTF8_3BYTE_FOLLOWING {
    Some( 3 )
  } else {
    None
  }
}


/// Returns `true` when the high bit of `byte` is clear, i.e. the byte is a
/// complete one-byte (ASCII) UTF-8 sequence.
pub fn isAscii( byte: u8 ) -> bool {
  byte & 0b10000000 == 0
}


#[cfg(test)]
mod tests {
  use super::{readCodepoint, UTF8_1BYTE_FOLLOWING};

  #[test]
  fn readCodepoint_Roundtrip_SimpleAscii() {
    assert_eq!( 'a', readCodepoint( b"a" ).unwrap() );
    assert_eq!( 'z', readCodepoint( b"z" ).unwrap() );
    assert_eq!( 'A', readCodepoint( b"A" ).unwrap() );
    assert_eq!( '9', readCodepoint( b"9" ).unwrap() );
    assert_eq!( '*', readCodepoint( b"*" ).unwrap() );
    assert_eq!( '\n', readCodepoint( b"\n" ).unwrap() );
    assert_eq!( '\0', readCodepoint( b"\0" ).unwrap() );
  }

  #[test]
  fn readCodepoint_Roundtrip_NonAscii() {
    // 2 UTF-8 bytes
    assert_eq!( '¢', readCodepoint( "¢".as_bytes() ).unwrap() );

    // 3 UTF-8 bytes
    assert_eq!( '€', readCodepoint( "€".as_bytes() ).unwrap() );

    // 4 UTF-8 bytes
    assert_eq!( '𤭢', readCodepoint( "𤭢".as_bytes() ).unwrap() );

    // Some extras
    assert_eq!( 'Ć', readCodepoint( "Ć".as_bytes() ).unwrap() );
    assert_eq!( 'Ө', readCodepoint( "Ө".as_bytes() ).unwrap() );
    assert_eq!( '自', readCodepoint( "自".as_bytes() ).unwrap() );
    assert_eq!( '由', readCodepoint( "由".as_bytes() ).unwrap() );
  }

  #[test]
  fn readCodepoint_FailsOnBadChars() {
    assert!( readCodepoint( &[ 0b11111111 ] ).is_none() );
    assert!( readCodepoint( &[ 0b10000000 ] ).is_none() );
    assert!( readCodepoint( &[ UTF8_1BYTE_FOLLOWING,
                               0b11000000 ] ).is_none() );
  }

  #[test]
  fn readCodepoint_FailsOnOverlongEncodings() {
    // U+0000 in two and three bytes
    assert!( readCodepoint( &[ 0xC0, 0x80 ] ).is_none() );
    assert!( readCodepoint( &[ 0xE0, 0x80, 0x80 ] ).is_none() );

    // U+007F in two bytes
    assert!( readCodepoint( &[ 0xC1, 0xBF ] ).is_none() );

    // U+002F ('/') in two, three and four bytes
    assert!( readCodepoint( &[ 0xC0, 0xAF ] ).is_none() );
    assert!( readCodepoint( &[ 0xE0, 0x80, 0xAF ] ).is_none() );
    assert!( readCodepoint( &[ 0xF0, 0x80, 0x80, 0xAF ] ).is_none() );
  }

  #[test]
  fn readCodepoint_AcceptsShortestFormBoundaries() {
    assert_eq!( '\u{7F}', readCodepoint( &[ 0x7F ] ).unwrap() );
    assert_eq!( '\u{80}', readCodepoint( &[ 0xC2, 0x80 ] ).unwrap() );
    assert_eq!( '\u{800}', readCodepoint( &[ 0xE0, 0xA0, 0x80 ] ).unwrap() );
    assert_eq!( '\u{10000}',
                readCodepoint( &[ 0xF0, 0x90, 0x80, 0x80 ] ).unwrap() );
    assert_eq!( '\u{10FFFF}',
                readCodepoint( &[ 0xF4, 0x8F, 0xBF, 0xBF ] ).unwrap() );
  }

  #[test]
  fn readCodepoint_FailsOnTruncatedOrEmptyInput() {
    assert!( readCodepoint( b"" ).is_none() );
    assert!( readCodepoint( &"¢".as_bytes()[ .. 1 ] ).is_none() );
    assert!( readCodepoint( &"€".as_bytes()[ .. 2 ] ).is_none() );
    assert!( readCodepoint( &"𤭢".as_bytes()[ .. 3 ] ).is_none() );
  }

  #[test]
  fn readCodepoint_FailsOnNonScalarValues() {
    // Surrogate U+D800
    assert!( readCodepoint( &[ 0xED, 0xA0, 0x80 ] ).is_none() );

    // U+110000, one past the last valid code point
    assert!( readCodepoint( &[ 0xF4, 0x90, 0x80, 0x80 ] ).is_none() );
  }
}