# This is a part of encoding-next.
# Copyright (c) 2013-2015, Kang Seonghoon.
# See README.md and LICENSE.txt for details.

import urllib.request
import urllib.parse
import urllib.error
import sys
import os.path
import re
import heapq
import argparse
from pathlib import Path


def open_index(path, comments):
    for line in path.open('rb'):
        line = line.strip()
        if not line:
            continue
        if line.startswith(b'#'):
            comments.append('//' + line.decode()[1:])
            continue
        parts = line.split(None, 2)
        key = int(parts[0], 0)
        value = int(parts[1], 0)
        yield key, value


def read_index(opts, crate, name, comments):
    dirname = Path(__file__).parent.resolve() / crate
    path = dirname / f'index-{name}.txt'
    if path.is_file():
        # For index-armscii-8
        return open_index(path, comments)
    os.makedirs(opts.cache_dir, exist_ok=True)
    cached_path = opts.cache_dir / f'{name}.txt'
    if not opts.flush_cache and cached_path.exists():
        print('(cached)', end='', file=sys.stderr)
    else:
        try:
            urllib.request.urlretrieve(
                f'https://encoding.spec.whatwg.org/index-{name}.txt',
                cached_path
            )
        except Exception:
            try:
                os.unlink(cached_path)
            except OSError:
                pass
            raise
    return open_index(cached_path, comments)


def mkdir_and_open(crate, name):
    crate_path = Path(__file__).parent.resolve() / crate
    os.makedirs(crate_path, exist_ok=True)
    rust_file = crate_path / ('%s.rs' % name.replace('-', '_'))
    return rust_file.open('wt', encoding='utf-8')


def dedent(s):
    return re.sub(r'(?m)^\s*\|?', '', s)


def write_header(f, name, comments):
    print('// AUTOGENERATED FROM index-%s.txt, ORIGINAL COMMENT FOLLOWS:' % name, file=f)
    print('//', file=f)
    for line in comments:
        print(line, file=f)


def write_fmt(f, args, fmt_or_cond, thenfmt=None, elsefmt=None, **kwargs):
    if thenfmt is not None:
        fmt = thenfmt if fmt_or_cond else elsefmt
    else:
        fmt = fmt_or_cond
    if fmt:
        kwargs.update(args)
        f.write(dedent(fmt).format(**kwargs))


def write_comma_separated(f, prefix, l, width=80):
    buffered = ''
    for i in l:
        i = str(i)
        if len(prefix) + len(buffered) + len(i) <= width:
            buffered += i
        else:
            print(prefix + buffered.rstrip(), file=f)
            buffered = i
    if buffered:
        print(prefix + buffered.rstrip(), file=f)


def optimize_overlapping_blocks(blocks):
    # let's imagine that there are three blocks of size 8:
    # [X,X,1,2,3,X,X,X], [4,X,X,5,X,X,X,X], [X,X,X,X,X,X,X,6]
    # concatenating them as is results in size 24. however, if we are
    # allowed to overlap them, there are shorter ways:
    # [X,X,1,2,3,X,X,X,4,X,X,5,X,X,X,X,X,X,X,6]
    # [4,X,X,5,X,X,X,X,1,2,3,X,X,X,X,X,X,X,6] (optimal)
    # this function returns a list of indices to reordered blocks,
    # each with the overlapping shift to apply before it is placed.
    # the first block gets the maximal shift to simplify things.
    # e.g. the first example would return [(0,2), (1,0), (2,4)];
    # the second example would return [(1,0), (0,2), (2,3)].
    #
    # if we define f(x,y) to be the maximal saving when block y is
    # put next to block x (note that f(x,y) != f(y,x)), this reduces
    # to the longest-path variant of TSP where each vertex is a block
    # and the edge weight is f(x,y); that is NP-hard, as cool problems
    # tend to be (ugh). we therefore stick to a simple greedy algorithm
    # that *always* picks the largest saving available at that time.
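    #
    # a note on why the greedy merge below is safe (illustration only):
    # the shift recorded for a pair is min(postgap of x, pregap of y),
    # i.e. only slots that are X (None) in *both* blocks ever overlap,
    # so no two real values compete for the same slot. e.g. putting
    # [X,X,1,2,3,X,X,X] after [4,X,X,5,X,X,X,X] overlaps
    # min(4, 2) = 2 slots.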
    pregaps = []
    postgaps = []
    for idx, blk in enumerate(blocks):
        assert any(x is not None for x in blk), 'no empty block allowed'
        for i, v in enumerate(blk):
            if v is not None:
                pregaps.append((-i, idx))
                break
        for i, v in enumerate(reversed(blk)):
            if v is not None:
                postgaps.append((-i, idx))
                break
    heapq.heapify(pregaps)
    heapq.heapify(postgaps)

    # a simple disjoint-set data structure
    group = [(i, 0) for i in range(len(blocks))]  # (parent, rank)

    def get_group(i):
        parent, rank = group[i]
        if parent != i:
            parent = get_group(parent)
            group[i] = parent, rank
        return parent

    def merge_groups(i, j):
        i = get_group(i)
        j = get_group(j)
        if i != j:
            iparent, irank = group[i]
            jparent, jrank = group[j]
            if irank < jrank:
                group[i] = jparent, irank
            elif irank > jrank:
                group[j] = iparent, jrank
            else:
                group[i] = iparent, irank + 1
                group[j] = iparent, jrank

    nextblk = {}
    prevblk = {}
    for i in range(len(blocks) - 1):
        #        <-- postgap --->
        # -----================]              preblk
        #            postblk [============--------
        #                    <- pregap ->
        postgap, preblk = heapq.heappop(postgaps)
        pregap, postblk = heapq.heappop(pregaps)

        # avoid making a cycle.
        pregroup = get_group(preblk)
        rejected = []
        while pregroup == get_group(postblk):
            rejected.append((pregap, postblk))
            pregap, postblk = heapq.heappop(pregaps)
        for item in rejected:
            heapq.heappush(pregaps, item)

        assert preblk not in nextblk
        nextblk[preblk] = postblk, min(-pregap, -postgap)
        prevblk[postblk] = preblk
        merge_groups(preblk, postblk)

    pregap, blk = pregaps[0]
    ret = [(blk, -pregap)]
    while blk in nextblk:
        blk, shift = nextblk.pop(blk)
        ret.append((blk, shift))
    assert len(ret) == len(blocks)
    return ret


def make_minimal_trie(invdata, lowerlimit):
    maxvalue = max(invdata) + 1
    best = 0xffffffff
    besttrie = None
    for triebits in range(21):
        blocks = []
        upperidx = []
        blockmap = {(None,) * (1 << triebits): -1}
        for i in range(0, maxvalue, 1 << triebits):
            blk = [invdata.get(j) for j in range(i, i + (1 << triebits))]
            blockidx = blockmap.get(tuple(blk))
            if blockidx is None:
                blockidx = len(blocks)
                blockmap[tuple(blk)] = blockidx
                blocks.append(blk)
            upperidx.append(blockidx)
        lower = [None] * (1 << triebits)
        uppermap = {-1: 0}
        for idx, shift in optimize_overlapping_blocks(blocks):
            blk = blocks[idx]
            assert shift == 0 or lower[-shift:] == blk[:shift]
            uppermap[idx] = len(lower) - shift
            lower += blk[shift:]
        upper = [uppermap[idx] for idx in upperidx]
        if len(lower) < lowerlimit and best > len(lower) + len(upper):
            best = len(lower) + len(upper)
            besttrie = (triebits, lower, upper)
    return besttrie


def make_minimal_search(data, invdata, premap, maxsearch):
    minkey = min(data)
    maxvalue = max(invdata) + 1
    best = 0xffffffff
    bestsearch = None
    for searchbits in range(21):
        lower = []
        upper = []
        for i in range(0, maxvalue, 1 << searchbits):
            v = sorted(premap(invdata[j])
                       for j in range(i, i + (1 << searchbits)) if j in invdata)
            if v:
                w = sorted((y - x, j) for j, (x, y) in enumerate(zip(v, v[1:])))
                count = v[-1] - v[0]
                block = [v[0], v[-1]]
                for k, j in reversed(w):
                    if count <= maxsearch:
                        break
                    assert v[j + 1] - v[j] == k
                    count -= k
                    block.append(v[j])
                    block.append(v[j + 1])
                block.sort()
                assert minkey <= block[0] and block[-1] < 0x7fff
                # (s, e) when s < 0x8000 is a range [s, e);
                # (s, e) when s >= 0x8000 is a single pair s.t. invdata[e] = s & 0x7fff
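                # e.g. a range entry (5, 12) tells the generated backward()
                # to scan FORWARD_TABLE[5..12] linearly, while a singleton
                # (0x8000 | 7, 0x3000) asserts FORWARD_TABLE[7] == 0x3000 and
                # yields pointer 7 without scanning (illustrative numbers,
                # both relative to minkey).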
                block = [(block[i] - minkey, block[i + 1] - minkey + 1)
                         if block[i] < block[i + 1] else
                         (0x8000 | (block[i] - minkey), data[block[i]] & 0xffff)
                         for i in range(0, len(block), 2)]
                assert all(block[i] != block[i + 1] for i in range(len(block) - 1))
            else:
                block = []
            upper.append(len(lower))
            lower += block
        upper.append(len(lower))
        if best >= len(lower) + 2 * len(upper):
            best = len(lower) + 2 * len(upper)
            bestsearch = (searchbits, lower, upper)
    return bestsearch


def generate_single_byte_index(opts, crate, name):
    forward, backward = {}, {}
    comments = []
    for key, value in read_index(opts, crate, name, comments):
        assert (0 <= key < 128 and 0 <= value < 0xffff and
                key not in forward and value not in backward)
        forward[key] = value
        backward[value] = key

    with mkdir_and_open(crate, name) as f:
        write_header(f, name, comments)

        def p(*args, **kwargs):
            print(*args, **kwargs, file=f)

        p("#[inline]")
        p("pub fn forward(code: u8) -> u16 {")
        p("    match code {")
        for key, value in forward.items():
            p(f"        {key} => {value},")
        p("        _ => 0xffff")
        p("    }")
        p("}")
        p()
        p("#[inline]")
        p("pub fn backward(code: u32) -> u8 {")
        p("    match code {")
        for key, value in backward.items():
            p(f"        {key} => {value},")
        p("        _ => 0")
        p("    }")
        p("}")
        p()
        p("#[cfg(test)]")
        p("single_byte_tests! {}")
    return 0, 0, 0


def generate_multi_byte_index(opts, crate, name):
    # some indices need an additional function for efficient mapping.
    def premap(i):
        return i

    premapcode = ''
    if not opts.no_premapping:
        if name == 'euc-kr':
            def premap(i):
                r, c = divmod(i, 190)
                if c >= 96:
                    if r < 44:
                        pass
                    elif r < 47:
                        return None
                    elif r < 72:
                        r -= 3
                    elif r < 73:
                        return None
                    else:
                        r -= 4
                    return r * (190 - 96) + (c - 96)
                else:
                    if c < 26:
                        pass
                    elif c < 32:
                        return None
                    elif c < 58:
                        c -= 6
                    elif c < 64:
                        return None
                    else:
                        c -= 12
                    return (125 - 4) * (190 - 96) + r * (96 - 12) + c

            premapcode = dedent('''\
                |
                |fn premap_forward(code: u16) -> u16 {
                |    let r = code / 190;
                |    let c = code % 190;
                |    if c >= 96 {
                |        let dr = match r {
                |            0..=43 => 0,
                |            44..=46 => return X,
                |            47..=71 => 3,
                |            72 => return X,
                |            73..=124 => 4,
                |            _ => return X,
                |        };
                |        (r - dr) * (190 - 96) + (c - 96)
                |    } else {
                |        let dc = match c {
                |            0..=25 => 0,
                |            26..=31 => return X,
                |            32..=57 => 6,
                |            58..=63 => return X,
                |            _ => 12,
                |        };
                |        (125 - 4) * (190 - 96) + r * (96 - 12) + (c - dc)
                |    }
                |}
                |
                |#[cfg(feature = "no-optimized-legacy-encoding")]
                |fn premap_backward(code: u16) -> u16 {
                |    if code < (125 - 4) * (190 - 96) {
                |        let r = code / (190 - 96);
                |        let c = code % (190 - 96);
                |        let dr = match r {
                |            0..=43 => 0,
                |            44..=68 => 3,
                |            _ => 4,
                |        };
                |        (r + dr) * 190 + (c + 96)
                |    } else if code < X {
                |        let code = code - (125 - 4) * (190 - 96);
                |        let r = code / (96 - 12);
                |        let c = code % (96 - 12);
                |        let dc = match c {
                |            0..=25 => 0,
                |            26..=51 => 6,
                |            _ => 12,
                |        };
                |        r * 190 + (c + dc)
                |    } else {
                |        X
                |    }
                |}
            ''')
        elif name == 'jis0208':
            def premap(i):
                if i < 690:
                    pass
                elif i < 1128:
                    return None
                elif i < 1220:
                    i -= 438
                elif i < 1410:
                    return None
                elif i < 7808:
                    i -= 628
                elif i < 8272:
                    return None
                elif i < 8648:
                    i -= 1092
                elif i < 10716:
                    return None
                else:
                    i -= 3160
                return i

            premapcode = dedent('''\
                |
                |fn premap_forward(code: u16) -> u16 {
                |    match code {
                |        0..=689 => code,
                |        690..=1127 => X,
                |        1128..=1219 => code - 438,
                |        1220..=1409 => X,
                |        1410..=7807 => code - 628,
                |        7808..=8271 => X,
                |        8272..=8647 => code - 1092,
                |        8648..=10715 => X,
                |        _ => code - 3160,
                |    }
                |}
                |
                |#[cfg(feature = "no-optimized-legacy-encoding")]
                |fn premap_backward(code: u16) -> u16 {
                |    match code {
                |        0..=689 => code,
                |        690..=781 => code + 438,
                |        782..=7179 => code + 628,
                |        7180..=7555 => code + 1092,
                |        _ => code.saturating_add(3160),
                |    }
                |}
            ''')
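        # worked example for the jis0208 premapping above (numbers derived
        # from the table, shown for illustration): pointer 1128 is the first
        # allocated code after the 438-entry gap [690,1128), so
        # premap(1128) == 1128 - 438 == 690, and premap_backward undoes it
        # via the 690..=781 => code + 438 arm.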
        elif name == 'jis0212':
            def premap(i):
                if i < 175:
                    pass
                elif i < 534:
                    return None
                elif i < 1027:
                    i -= 359
                elif i < 1410:
                    return None
                else:
                    i -= 742
                return i

            premapcode = dedent('''\
                |
                |fn premap_forward(code: u16) -> u16 {
                |    match code {
                |        0..=174 => code,
                |        175..=533 => X,
                |        534..=1026 => code - 359,
                |        1027..=1409 => X,
                |        _ => code - 742,
                |    }
                |}
                |
                |#[cfg(feature = "no-optimized-legacy-encoding")]
                |fn premap_backward(code: u16) -> u16 {
                |    match code {
                |        0..=174 => code,
                |        175..=667 => code + 359,
                |        _ => code.saturating_add(742),
                |    }
                |}
            ''')

    data = {}         # key => value
    invdata = {}      # (the first) value => key, with some exceptions
    dups = []         # any key whose value is not mapped in invdata
    rawdups = []      # same as dups, but as literal Rust code
    comments = []     # the comments in the index file
    morebits = False  # True if the mapping needs SIP
    for key, value in read_index(opts, crate, name, comments):
        assert (0 <= key < 0xffff and 0 <= value < 0x110000 and
                value != 0xffff and key not in data)
        if value >= 0x10000:
            assert (value >> 16) == 2
            morebits = True
        data[key] = value
        if value not in invdata:
            invdata[value] = key
        else:
            dups.append(key)

    if name == 'big5':
        # Big5 has four two-letter forward mappings; we use special entries for them
        specialidx = [1133, 1135, 1164, 1166]
        assert all(key not in data for key in specialidx)
        assert all(value not in invdata for value in range(len(specialidx)))
        for value, key in enumerate(specialidx):
            data[key] = value
            dups.append(key)  # no consistency testing for them

        # and HKSCS additions are entirely missing from the backward mapping
        hkscslimit = (0xa1 - 0x81) * 157
        for value, key in list(invdata.items()):
            if key < hkscslimit:
                del invdata[value]
        rawdups.append('0..=%d' % (hkscslimit - 1))  # no consistency testing for them

        # there are also some duplicate entries where the *later* mapping is canonical
        swappedcanon = [0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345]
        olddups = dups
        dups = []
        for key in olddups:
            value = data[key]
            if value in swappedcanon:
                dups.append(invdata[value])
                invdata[value] = key
            else:
                dups.append(key)
        dups = [i for i in dups if i >= hkscslimit]  # cleanup

    # the JIS X 0208 index has two ranges [8272,8836) and [8836,11280) to support
    # two slightly different encodings, EUC-JP and Shift_JIS; the default backward
    # function would favor the former, so we need a separate mapping for the latter.
    #
    # fortunately for us, all allocated codes in [8272,8836) have counterparts
    # elsewhere, so we only need a smaller remapping from [8272,8836) to those
    # counterparts.
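    # e.g. if data[8272] maps to the same code point as some pointer p outside
    # [8272,8836), the table built below stores p at remap[0]; unallocated
    # pointers in that range get the placeholder 0xffff.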
    remap = None
    if name == 'jis0208':
        REMAP_MIN = 8272
        REMAP_MAX = 8835
        invdataminusremap = {}
        for key, value in list(data.items()):
            if value not in invdataminusremap and not REMAP_MIN <= key <= REMAP_MAX:
                invdataminusremap[value] = key
        remap = []
        for i in range(REMAP_MIN, REMAP_MAX + 1):
            if i in data:
                assert data[i] in invdataminusremap
                value = invdataminusremap[data[i]]
                assert value < 0x10000
                remap.append(value)
            else:
                remap.append(0xffff)

    newdata = {}
    for key, value in list(data.items()):
        key = premap(key)
        assert key is not None and 0 <= key < 0x10000 and key not in newdata
        newdata[key] = value
    data = newdata

    # generate a trie and search index with a minimal amount of data
    triebits, trielower, trieupper = make_minimal_trie(invdata, lowerlimit=0x10000)
    searchbits, searchlower, searchupper = make_minimal_search(
        data, invdata, premap, maxsearch=opts.max_backward_search_multibyte)

    # if the search degenerated into a full linear search, generate special code for it
    fulllinearsearch = (searchupper == [0, 1])

    minkey = min(data)
    maxkey = max(data) + 1
    args = dict(
        premapcode=premapcode,
        maxvalue=max(invdata),
        dataoff=minkey,
        datasz=maxkey - minkey,
        triebits=triebits,
        triemask=(1 << triebits) - 1,
        trielowersz=len(trielower),
        trieuppersz=len(trieupper),
        fulllinearsearch=fulllinearsearch,
        searchbits=searchbits,
        searchmask=(1 << searchbits) - 1,
        searchlowersz=len(searchlower),
        searchuppersz=len(searchupper),
        searchupperszm1=len(searchupper) - 1,
    )
    if remap:
        args.update(
            remapsz=len(remap),
            remapmin=REMAP_MIN,
            remapmax=REMAP_MAX,
        )

    with mkdir_and_open(crate, name) as f:
        write_header(f, name, comments)
        write_fmt(f, args, '''\
            |
            |#[allow(dead_code)] const X: u16 = 0xffff;
            |{premapcode}
            |const FORWARD_TABLE: &'static [u16] = &[
        ''')
        write_comma_separated(f, '    ', ['%s, ' % (data[key] & 0xffff if key in data else 'X')
                                          for key in range(minkey, maxkey)])
        write_fmt(f, args, '''\
            |]; // {datasz} entries
        ''')

        if morebits:
            bits = []
            for i in range(minkey, maxkey, 32):
                v = 0
                for j in range(32):
                    v |= (data.get(i + j, 0) >= 0x10000) << j
                bits.append(v)
            write_fmt(f, args, '''\
                |
                |const FORWARD_TABLE_MORE: &'static [u32] = &[
            ''')
            write_comma_separated(f, '    ', ['%d, ' % v for v in bits])
            write_fmt(f, args, '''\
                |]; // {moresz} entries
            ''', moresz=len(bits))

        write_fmt(f, args, '''\
            |
            |/// Returns the index code point for pointer `code` in this index.
            |#[inline]
            |pub fn forward(code: u16) -> u32 {{
        ''')
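        # what follows emits the body of forward(): premap the pointer if
        # needed, rebase it by {dataoff}, and index FORWARD_TABLE; for
        # indices reaching into SIP (morebits), bit 17 recovered from
        # FORWARD_TABLE_MORE turns the stored u16 back into U+2XXXX.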
        write_fmt(f, args, premapcode, '''\
            |    let code = premap_forward(code);
        ''')
        write_fmt(f, args, minkey != 0, '''\
            |    let code = (code as usize).wrapping_sub({dataoff});
        ''', '''\
            |    let code = code as usize;
        ''')
        write_fmt(f, args, '''\
            |    if code < {datasz} {{
        ''')
        write_fmt(f, args, morebits, '''\
            |        (FORWARD_TABLE[code] as u32) | (((FORWARD_TABLE_MORE[code >> 5] >> (code & 31)) & 1) << 17)
        ''', '''\
            |        FORWARD_TABLE[code] as u32
        ''')
        write_fmt(f, args, '''\
            |    }} else {{
            |        X as u32
            |    }}
            |}}
            |
            |#[cfg(not(feature = "no-optimized-legacy-encoding"))]
            |const BACKWARD_TABLE_LOWER: &'static [u16] = &[
        ''')
        write_comma_separated(f, '    ', ['%s, ' % ('X' if v is None else v) for v in trielower])
        write_fmt(f, args, '''\
            |]; // {trielowersz} entries
            |
            |#[cfg(not(feature = "no-optimized-legacy-encoding"))]
            |const BACKWARD_TABLE_UPPER: &'static [u16] = &[
        ''')
        write_comma_separated(f, '    ', ['%d, ' % v for v in trieupper])
        write_fmt(f, args, '''\
            |]; // {trieuppersz} entries
        ''')

        if not fulllinearsearch:
            write_fmt(f, args, '''\
                |
                |#[cfg(feature = "no-optimized-legacy-encoding")]
                |const BACKWARD_SEARCH_LOWER: &'static [(u16, u16)] = &[
            ''')
            write_comma_separated(f, '    ', ['(%d, %d), ' % (lo, hi) for lo, hi in searchlower])
            write_fmt(f, args, '''\
                |]; // {searchlowersz} entries
                |
                |#[cfg(feature = "no-optimized-legacy-encoding")]
                |const BACKWARD_SEARCH_UPPER: &'static [u16] = &[
            ''')
            write_comma_separated(f, '    ', ['%d, ' % v for v in searchupper])
            write_fmt(f, args, '''\
                |]; // {searchuppersz} entries
            ''')

        if remap:
            write_fmt(f, args, '''\
                |
                |const BACKWARD_TABLE_REMAPPED: &'static [u16] = &[
            ''')
            write_comma_separated(f, '    ', ['%d, ' % v for v in remap])
            write_fmt(f, args, '''\
                |]; // {remapsz} entries
            ''')

        write_fmt(f, args, '''\
            |
            |/// Returns the index pointer for code point `code` in this index.
            |#[inline]
            |#[cfg(not(feature = "no-optimized-legacy-encoding"))]
            |pub fn backward(code: u32) -> u16 {{
            |    let offset = (code >> {triebits}) as usize;
            |    let offset = if offset < {trieuppersz} {{BACKWARD_TABLE_UPPER[offset] as usize}} else {{0}};
            |    // BACKWARD_TABLE_LOWER stores the original (never premapped) pointer,
            |    // so we don't have to call premap_backward here.
            |    BACKWARD_TABLE_LOWER[offset + ((code & {triemask}) as usize)]
            |}}
            |
            |/// Returns the index pointer for code point `code` in this index.
            |#[cfg(feature = "no-optimized-legacy-encoding")]
            |pub fn backward(code: u32) -> u16 {{
            |    // avoid mistaking a placeholder for the actual value
            |    if code == X as u32 {{ return 0xffff; }}
            |    let codelo = (code & 0xffff) as u16;
        ''')
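        # the optimized backward() above is a two-level trie lookup: the
        # upper table maps code >> triebits to an offset into the lower
        # table, with out-of-range offsets falling back to slot 0 (the
        # all-X block). the unoptimized variant emitted below re-derives
        # the pointer by searching FORWARD_TABLE instead of carrying a
        # second full table.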
        retexpr = ('premap_backward(%s)' if premapcode else '%s') % (
            '(%s) + {dataoff}' if minkey != 0 else '%s')
        if morebits:
            write_fmt(f, args, '''\
                |    let codehi = code >> 16;
                |    #[inline] fn verify_and_map(codehi: u32, i: u16) -> Option<u16> {{
                |        let hi = ((FORWARD_TABLE_MORE[i as usize >> 5] >> (i & 31)) & 1) << 1;
                |        if hi != codehi {{ return None; }}
                |        Some(''' + (retexpr % 'i') + ''')
                |    }}
            ''')
            retifcorrect = 'if let Some(i_) = verify_and_map(codehi, %s) {{ return i_; }}'
        else:
            retifcorrect = 'return %s;' % (retexpr % '%s')
        write_fmt(f, args, not fulllinearsearch, '''\
            |    let offset = (code >> {searchbits}) as usize;
            |    let (start, end) = if offset < {searchupperszm1} {{
            |        (BACKWARD_SEARCH_UPPER[offset], BACKWARD_SEARCH_UPPER[offset+1])
            |    }} else {{
            |        (0, 0)
            |    }};
            |    for &(s, e) in &BACKWARD_SEARCH_LOWER[(start as usize)..(end as usize)] {{
            |        if s >= 0x8000 {{
            |            if e == codelo {{
            |                ''' + (retifcorrect % 's & 0x7fff') + '''
            |            }}
            |        }} else {{
            |            for i in s..e {{
            |                if FORWARD_TABLE[i as usize] == codelo {{
            |                    ''' + (retifcorrect % 'i') + '''
            |                }}
            |            }}
            |        }}
            |    }}
        ''', '''\
            |    if code <= {maxvalue} {{
            |        for (i, &v) in FORWARD_TABLE.iter().enumerate() {{
            |            if v == codelo {{
            |                ''' + (retifcorrect % 'i as u16') + '''
            |            }}
            |        }}
            |    }}
        ''')
        write_fmt(f, args, '''\
            |    X
            |}}
        ''')

        write_fmt(f, args, name == 'jis0208', '''\
            |
            |/// Returns the index shift_jis pointer for code point `code`.
            |#[inline]
            |pub fn backward_remapped(code: u32) -> u16 {{
            |    let value = backward(code);
            |    if {remapmin} <= value && value <= {remapmax} {{
            |        BACKWARD_TABLE_REMAPPED[(value - {remapmin}) as usize]
            |    }} else {{
            |        value
            |    }}
            |}}
        ''')

        write_fmt(f, args, '''\
            |
            |#[cfg(test)]
            |multi_byte_tests! {{
        ''')
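        # pointers listed in dups (plus the literal Rust ranges in rawdups)
        # are excluded from the generated round-trip consistency test, since
        # their code points canonically map back to a different pointer.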
        write_fmt(f, args, remap, '''\
            |    remap = [{remapmin}, {remapmax}],
        ''')
        if dups or rawdups:
            write_fmt(f, args, '''\
                |    dups = [
            ''')
            write_comma_separated(f, '        ', ['%s, ' % v for v in rawdups] +
                                                 ['%d, ' % v for v in sorted(dups)])
            write_fmt(f, args, '''\
                |    ]
            ''')
        else:
            write_fmt(f, args, '''\
                |    dups = []
            ''')
        write_fmt(f, args, '''\
            |}}
        ''')

    forwardsz = 2 * (maxkey - minkey)
    backwardsz = 2 * len(trielower) + 2 * len(trieupper)
    backwardszslow = 2 * len(searchlower) + 4 * len(searchupper)
    backwardmore = 0
    if morebits:
        backwardmore += 4 * ((maxkey - minkey + 31) // 32)
    if remap:
        backwardmore += 2 * len(remap)
    return forwardsz, backwardsz + backwardmore, backwardszslow + backwardmore


def generate_multi_byte_range_lbound_index(opts, crate, name):
    data = []
    comments = []
    for key, value in read_index(opts, crate, name, comments):
        data.append((key, value))
    assert data and data == sorted(data)
    minkey, minvalue = data[0]
    maxkey, maxvalue = data[-1]
    if data[0] != (0, 0):
        data.insert(0, (0, 0))

    maxlog2 = 0
    while 2**(maxlog2 + 1) <= len(data):
        maxlog2 += 1

    if name == 'gb18030-ranges':
        keyubound = 0x110000
        valueubound = 126 * 10 * 126 * 10
    else:
        keyubound = maxkey + 1
        valueubound = maxvalue + 1

    args = dict(
        datasz=len(data),
        minkey=minkey,
        maxkey=maxkey,
        keyubound=keyubound,
        minvalue=minvalue,
        maxvalue=maxvalue,
        valueubound=valueubound,
    )

    with mkdir_and_open(crate, name) as f:
        write_header(f, name, comments)
        write_fmt(f, args, '''\
            |
            |const FORWARD_TABLE: &'static [u32] = &[
        ''')
        write_comma_separated(f, '    ', ['%d, ' % value for key, value in data])
        write_fmt(f, args, '''\
            |]; // {datasz} entries
            |
            |const BACKWARD_TABLE: &'static [u32] = &[
        ''')
        write_comma_separated(f, '    ', ['%d, ' % key for key, value in data])
        write_fmt(f, args, '''\
            |]; // {datasz} entries
            |
            |fn search(code: u32, fromtab: &'static [u32], totab: &'static [u32]) -> u32 {{
            |    let mut i = if code >= fromtab[{firstoff}] {{{firstdelta}}} else {{0}};
        ''', firstoff=2**maxlog2 - 1, firstdelta=len(data) - 2**maxlog2 + 1)
        for i in range(maxlog2 - 1, -1, -1):
            write_fmt(f, args, '''\
                |    if code >= fromtab[i{plusoff}] {{ i += {delta}; }}
            ''', plusoff='+%d' % (2**i - 1) if i > 0 else '', delta=2**i)
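        # the probes emitted above form a branchless lower-bound search:
        # after the first comparison against fromtab[2**maxlog2 - 1], each
        # halving step can advance unconditionally without going out of
        # bounds, leaving i as the number of entries <= code, so
        # fromtab[i-1] is the start of the range containing code.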
        write_fmt(f, args, '''\
            |    (code - fromtab[i-1]) + totab[i-1]
            |}}
            |
            |/// Returns the index code point for pointer `code` in this index.
            |#[inline]
            |pub fn forward(code: u32) -> u32 {{
        ''')
        write_fmt(f, args, minkey > 0, '''\
            |    if code < {minkey} {{ return 0xffffffff; }}
        ''')
        # GB 18030 has an "invalid" region inside and a singular mapping
        write_fmt(f, args, name == 'gb18030-ranges', '''\
            |    if (code > 39419 && code < 189000) || code > 1237575 {{ return 0xffffffff; }}
            |    if code == 7457 {{ return 0xe7c7; }}
        ''')
        write_fmt(f, args, '''\
            |    search(code, BACKWARD_TABLE, FORWARD_TABLE)
            |}}
            |
            |/// Returns the index pointer for code point `code` in this index.
            |#[inline]
            |pub fn backward(code: u32) -> u32 {{
        ''')
        write_fmt(f, args, minvalue > 0, '''\
            |    if code < {minvalue} {{ return 0xffffffff; }}
        ''')
        # GB 18030 has a singular mapping
        write_fmt(f, args, name == 'gb18030-ranges', '''\
            |    if code == 0xe7c7 {{ return 7457; }}
        ''')
        write_fmt(f, args, '''\
            |    search(code, FORWARD_TABLE, BACKWARD_TABLE)
            |}}
            |
            |#[cfg(test)]
            |multi_byte_range_tests! {{
            |    key = [{minkey}, {maxkey}], key < {keyubound},
            |    value = [{minvalue}, {maxvalue}], value < {valueubound}
            |}}
        ''')

    forwardsz = 4 * len(data)
    backwardsz = 4 * len(data)
    return forwardsz, backwardsz, backwardsz


INDICES = [
    ('singlebyte/armscii-8', generate_single_byte_index),
    ('singlebyte/ibm866', generate_single_byte_index),
    ('singlebyte/iso-8859-2', generate_single_byte_index),
    ('singlebyte/iso-8859-3', generate_single_byte_index),
    ('singlebyte/iso-8859-4', generate_single_byte_index),
    ('singlebyte/iso-8859-5', generate_single_byte_index),
    ('singlebyte/iso-8859-6', generate_single_byte_index),
    ('singlebyte/iso-8859-7', generate_single_byte_index),
    ('singlebyte/iso-8859-8', generate_single_byte_index),
    ('singlebyte/iso-8859-10', generate_single_byte_index),
    ('singlebyte/iso-8859-13', generate_single_byte_index),
    ('singlebyte/iso-8859-14', generate_single_byte_index),
    ('singlebyte/iso-8859-15', generate_single_byte_index),
    ('singlebyte/iso-8859-16', generate_single_byte_index),
    ('singlebyte/koi8-r', generate_single_byte_index),
    ('singlebyte/koi8-u', generate_single_byte_index),
    ('singlebyte/macintosh', generate_single_byte_index),
    ('singlebyte/windows-874', generate_single_byte_index),
    ('singlebyte/windows-1250', generate_single_byte_index),
    ('singlebyte/windows-1251', generate_single_byte_index),
    ('singlebyte/windows-1252', generate_single_byte_index),
    ('singlebyte/windows-1253', generate_single_byte_index),
    ('singlebyte/windows-1254', generate_single_byte_index),
    ('singlebyte/windows-1255', generate_single_byte_index),
    ('singlebyte/windows-1256', generate_single_byte_index),
    ('singlebyte/windows-1257', generate_single_byte_index),
    ('singlebyte/windows-1258', generate_single_byte_index),
    ('singlebyte/x-mac-cyrillic', generate_single_byte_index),
    ('tradchinese/big5', generate_multi_byte_index),
    ('korean/euc-kr', generate_multi_byte_index),
    ('simpchinese/gb18030', generate_multi_byte_index),
    ('japanese/jis0208', generate_multi_byte_index),
    ('japanese/jis0212', generate_multi_byte_index),
    ('simpchinese/gb18030-ranges', generate_multi_byte_range_lbound_index),
]


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--flush-cache', action='store_true',
        help='flush the download cache'
    )
    parser.add_argument(
        '--cache-dir',
        default=Path(__file__).parent.resolve() / '.cache',
        help='set the download cache directory [default: %(default)s]',
        type=Path
    )
    parser.add_argument(
        '--singlebyte', dest='func_filter', action='store_const',
        const=generate_single_byte_index,
        help='generate only single-byte indices'
    )
    parser.add_argument(
        '--multibyte', dest='func_filter', action='store_const',
        const=generate_multi_byte_index,
        help='generate only multi-byte indices'
    )
    parser.add_argument(
        '--max-backward-search-multibyte', type=lambda v: int(v, 0),
        metavar='MAX_SEARCH', default='0x200',
        help='set the max search limit of the unoptimized backward mapping '
             'for multi-byte indices [default: %(default)s]'
    )
    parser.add_argument(
        '--no-premapping', action='store_true',
        help='disable premapping; trades table size for decoder performance'
    )
    parser.add_argument(
        'filters', nargs='*',
        help='substring of indices to regenerate'
    )
    opts = parser.parse_args()

    totalsz = totalszslow = 0
    for index, generate in INDICES:
        crate, __, index = index.partition('/')
        if opts.filters and all(s not in index for s in opts.filters):
            continue
        if opts.func_filter and generate is not opts.func_filter:
            continue
        print('generating index %s...' % index, end='', file=sys.stderr)
        forwardsz, backwardsz, backwardszslow = generate(opts, crate, index)
        totalsz += forwardsz + backwardsz
        totalszslow += forwardsz + backwardszslow
        print('%d + %d (%d) = %d (%d) bytes.'
              % (forwardsz, backwardsz, backwardszslow,
                 forwardsz + backwardsz, forwardsz + backwardszslow),
              file=sys.stderr)
    print('total %d (%d) bytes.' % (totalsz, totalszslow), file=sys.stderr)


if __name__ == '__main__':
    main()
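
# A sketch of typical invocations, assuming this script is saved as
# gen_index.py (the filename here is hypothetical) and run from the
# repository root:
#
#   python3 gen_index.py                    # regenerate every index
#   python3 gen_index.py --singlebyte       # single-byte indices only
#   python3 gen_index.py jis0208 jis0212    # indices matching a substring
#
# Downloaded copies of the WHATWG index files are kept in .cache/ next to
# the script; pass -f/--flush-cache to refetch them.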