# Copyright 2015 Google Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # get ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt # Usage: python tools/mk_puncttable.py UnicodeData.txt > src/puncttable.rs import sys def get_bits(high, punct): b = 0 for i in range(16): if high * 16 + i in punct: b |= 1 << i return b def main(args): ascii_punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" ascii_set = set((ord(c) for c in ascii_punct)) punct = set() for line in file(args[1]): spl = line.split(';') if spl[2] in ('Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps'): punct.add(int(spl[0], 16)) pshift = list(set((cp // 16 for cp in punct if cp >= 128))) pshift.sort() bits = [get_bits(high, punct) for high in pshift] print """// Copyright 2015 Google Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. //! CommonMark punctuation set based on spec and Unicode properties. // Autogenerated by mk_puncttable.py const PUNCT_MASKS_ASCII: [u16; 8] = [""" for x in range(8): y = get_bits(x, ascii_set) print ' 0x%04x, // U+%04X...U+%04X' % (y, x * 16, x * 16 + 15) print """ ]; const PUNCT_TAB: [u16; %i] = [""" % len(pshift) for x in pshift: print ' %d, // U+%04X...U+%04X' % (x, x * 16, x * 16 + 15) print """ ]; const PUNCT_MASKS: [u16; %i] = [""" % len(pshift) for i, y in enumerate(bits): x = pshift[i] print ' 0x%04x, // U+%04X...U+%04X' % (y, x * 16, x * 16 + 15) print """ ]; pub fn is_ascii_punctuation(c: u8) -> bool { c < 128 && (PUNCT_MASKS_ASCII[(c / 16) as usize] & (1 << (c & 15))) != 0 } pub fn is_punctuation(c: char) -> bool { let cp = c as u32; if cp < 128 {return is_ascii_punctuation(cp as u8); } if cp > 0x%04X { return false; } let high = (cp / 16) as u16; match PUNCT_TAB.binary_search(&high) { Ok(index) => (PUNCT_MASKS[index] & (1 << (cp & 15))) != 0, _ => false } } #[cfg(test)] mod tests { use super::{is_ascii_punctuation, is_punctuation}; #[test] fn test_ascii() { assert!(is_ascii_punctuation(b'!')); assert!(is_ascii_punctuation(b'@')); assert!(is_ascii_punctuation(b'~')); assert!(!is_ascii_punctuation(b' ')); assert!(!is_ascii_punctuation(b'0')); assert!(!is_ascii_punctuation(b'A')); assert!(!is_ascii_punctuation(0xA1)); } #[test] fn test_unicode() { assert!(is_punctuation('~')); assert!(!is_punctuation(' ')); assert!(is_punctuation('\u{00A1}')); assert!(is_punctuation('\u{060C}')); assert!(is_punctuation('\u{FF65}')); assert!(is_punctuation('\u{1BC9F}')); assert!(!is_punctuation('\u{1BCA0}')); } } """ % max(punct) main(sys.argv)