#!/usr/bin/env python from __future__ import absolute_import, division, print_function import codecs from operator import itemgetter import sys if __name__ == '__main__': # Get frequency counts of each byte. freqs = [0] * 256 # byte |--> frequency for fpath in sys.argv[1:]: with codecs.open(fpath, 'r', 'utf-8') as fin: for line in fin: for byte in line.strip().encode('utf-8'): freqs[byte] += 1 # Create the inverse mapping. orders = [0] * 256 # byte |--> sort index, descending sort_by_freq = sorted(zip(range(256), freqs), key=itemgetter(1), reverse=True) for sort_idx, byte in enumerate(map(itemgetter(0), sort_by_freq)): orders[byte] = sort_idx # Now write Rust. olines = ['pub const COMMON_INPUTS: [u8; 256] = ['] for byte in range(256): olines.append(' %3d, // %r' % (orders[byte], chr(byte))) olines.append('];') olines.append('') olines.append('pub const COMMON_INPUTS_INV: [u8; 256] = [') for sort_idx in range(256): byte = orders.index(sort_idx) if byte <= 127: olines.append(' b%r,' % chr(byte)) else: olines.append(" b'\\x%x'," % byte) olines.append('];') print('\n'.join(olines))