# This is a part of encoding-next.
# Copyright (c) 2013-2015, Kang Seonghoon.
# See README.md and LICENSE.txt for details.
import urllib.request
import urllib.parse
import urllib.error
import sys
import os.path
import re
import heapq
import argparse
from pathlib import Path
CC0_LICENSE = """encoding-next by Kang Seonghoon
To the extent possible under law, the person who associated CC0 with
encoding-next has waived all copyright and related or neighboring rights
to encoding-next.
You should have received a copy of the CC0 legalcode along with this
work. If not, see .
"""
def open_index(path, comments):
for line in path.open('rb'):
line = line.strip()
if not line:
continue
if line.startswith(b'#'):
comments.append('//' + line.decode()[1:])
continue
parts = line.split(None, 2)
key = int(parts[0], 0)
value = int(parts[1], 0)
yield key, value
def read_index(opts, crate, name, comments):
dirname = Path(__file__).parent.resolve() / crate
path = dirname / f'index-{name}.txt'
if path.is_file(): # For index-armscii-8
return open_index(path, comments)
os.makedirs(opts.cache_dir, exist_ok=True)
cached_path = opts.cache_dir / f'{name}.txt'
if not opts.flush_cache and cached_path.exists():
print('(cached)', end='', file=sys.stderr)
else:
try:
urllib.request.urlretrieve(
f'https://encoding.spec.whatwg.org/index-{name}.txt', cached_path
)
except Exception:
try:
os.unlink(cached_path)
except OSError:
pass
raise
return open_index(cached_path, comments)
def write_license_file(crate):
dirname = os.path.join(os.path.dirname(__file__), crate)
try:
os.mkdir(dirname)
except Exception:
pass
with open(os.path.join(dirname, "LICENSE"), 'w') as f:
f.write(CC0_LICENSE)
def mkdir_and_open(crate, name):
crate_path = Path(__file__).parent.resolve() / crate
os.makedirs(crate_path, exist_ok=True)
rust_file = crate_path / ('%s.rs' % name.replace('-', '_'))
return rust_file.open('wt', encoding='utf-8')
def dedent(s):
return re.sub(r'(?m)^\s*\|?', '', s)
def write_header(f, name, comments):
print('// AUTOGENERATED FROM index-%s.txt, ORIGINAL COMMENT FOLLOWS:' % name, file=f)
print('//', file=f)
for line in comments:
print(line, file=f)
def write_fmt(f, args, fmt_or_cond, thenfmt=None, elsefmt=None, **kwargs):
if thenfmt is not None:
fmt = thenfmt if fmt_or_cond else elsefmt
else:
fmt = fmt_or_cond
if fmt:
kwargs.update(args)
f.write(dedent(fmt).format(**kwargs))
def write_comma_separated(f, prefix, l, width=100):
buffered = ''
for i in l:
i = str(i)
if len(prefix) + len(buffered) + len(i) <= width:
buffered += i
else:
print(prefix + buffered.rstrip(), file=f)
buffered = i
if buffered:
print(prefix + buffered.rstrip(), file=f)
def optimize_overlapping_blocks(blocks):
# let's imagine that there are three blocks of size 8:
# [X,X,1,2,3,X,X,X], [4,X,X,5,X,X,X,X], [X,X,X,X,X,X,X,6]
# concatenating them as is results in size 24. however if we are
# allowed to overlap them, there are shorter ways:
# [X,X,1,2,3,X,X,X,4,X,X,5,X,X,X,X,X,X,X,6]
# [4,X,X,X,5,X,X,X,1,2,3,X,X,X,X,X,X,X,6] (optimal)
# this function returns a list of indices to reordered blocks,
# with overlapping shifts before putting them.
# the first block gets the maximal shift to simplify things.
# e.g. the first example would return [(0,2), (1,0), (2,4)];
# the second example would return [(1,0), (0,2), (2,3)].
#
# if we define f(x,y) to be the maximal saving when block y is
# next to x (note that f(x,y) != f(y,x)), this reduces to
# the longest TSP where the vertex is a block and the edge weight
# is f(x,y). this is NP-hard as always with a cool algorithm (ugh).
# we therefore stick to a simple greedy algorithm that
# *always* pick the largest saving at that time.
pregaps = []
postgaps = []
for idx, blk in enumerate(blocks):
assert any(x is not None for x in blk), 'no empty block allowed'
for i, v in enumerate(blk):
if v is not None:
pregaps.append((-i, idx))
break
for i, v in enumerate(reversed(blk)):
if v is not None:
postgaps.append((-i, idx))
break
heapq.heapify(pregaps)
heapq.heapify(postgaps)
# a simple disjoint-set data structure
group = [(i, 0) for i in range(len(blocks))] # (parent, rank)
def get_group(i):
parent, rank = group[i]
if parent != i:
parent = get_group(parent)
group[i] = parent, rank
return parent
def merge_groups(i, j):
i = get_group(i)
j = get_group(j)
if i != j:
iparent, irank = group[i]
jparent, jrank = group[j]
if irank < jrank:
group[i] = jparent, irank
elif irank > jrank:
group[j] = iparent, jrank
else:
group[i] = iparent, irank + 1
group[j] = iparent, jrank
nextblk = {}
prevblk = {}
for i in range(len(blocks) - 1):
# <-- postgap --->
# -----================] preblk
# postblk [============--------
# <- pregap ->
postgap, preblk = heapq.heappop(postgaps)
pregap, postblk = heapq.heappop(pregaps)
# avoid making a cycle.
pregroup = get_group(preblk)
rejected = []
while pregroup == get_group(postblk):
rejected.append((pregap, postblk))
pregap, postblk = heapq.heappop(pregaps)
for item in rejected:
heapq.heappush(pregaps, item)
assert preblk not in nextblk
nextblk[preblk] = postblk, min(-pregap, -postgap)
prevblk[postblk] = preblk
merge_groups(preblk, postblk)
pregap, blk = pregaps[0]
ret = [(blk, -pregap)]
while blk in nextblk:
blk, shift = nextblk.pop(blk)
ret.append((blk, shift))
assert len(ret) == len(blocks)
return ret
def make_minimal_trie(invdata, lowerlimit):
maxvalue = max(invdata) + 1
best = 0xffffffff
besttrie = None
for triebits in range(21):
blocks = []
upperidx = []
blockmap = {(None,) * (1 << triebits): -1}
for i in range(0, maxvalue, 1 << triebits):
blk = [invdata.get(j) for j in range(i, i + (1 << triebits))]
blockidx = blockmap.get(tuple(blk))
if blockidx is None:
blockidx = len(blocks)
blockmap[tuple(blk)] = blockidx
blocks.append(blk)
upperidx.append(blockidx)
lower = [None] * (1 << triebits)
uppermap = {-1: 0}
for idx, shift in optimize_overlapping_blocks(blocks):
blk = blocks[idx]
assert shift == 0 or lower[-shift:] == blk[:shift]
uppermap[idx] = len(lower) - shift
lower += blk[shift:]
upper = [uppermap[idx] for idx in upperidx]
if len(lower) < lowerlimit and best > len(lower) + len(upper):
best = len(lower) + len(upper)
besttrie = (triebits, lower, upper)
return besttrie
def make_minimal_search(data, invdata, premap, maxsearch):
minkey = min(data)
maxvalue = max(invdata) + 1
best = 0xffffffff
bestsearch = None
for searchbits in range(21):
lower = []
upper = []
for i in range(0, maxvalue, 1 << searchbits):
v = sorted(premap(invdata[j]) for j in range(i, i + (1 << searchbits)) if j in invdata)
if v:
w = sorted((y - x, j) for j, (x, y) in enumerate(zip(v, v[1:])))
count = v[-1] - v[0]
block = [v[0], v[-1]]
for k, j in reversed(w):
if count <= maxsearch:
break
assert v[j + 1] - v[j] == k
count -= k
block.append(v[j])
block.append(v[j + 1])
block.sort()
assert minkey <= block[0] and block[-1] < 0x7fff
# (s, e) when s < 0x8000 is a range [s, e)
# (s, e) when s >= 0x8000 is a single pair s.t. invdata[e] = s & 0x7fff
block = [(block[i] - minkey, block[i + 1] - minkey + 1)
if block[i] < block[i + 1] else
(0x8000 | (block[i] - minkey), data[block[i]] & 0xffff)
for i in range(0, len(block), 2)]
assert all(block[i] != block[i + 1] for i in range(len(block) - 1))
else:
block = []
upper.append(len(lower))
lower += block
upper.append(len(lower))
if best >= len(lower) + 2 * len(upper):
best = len(lower) + 2 * len(upper)
bestsearch = (searchbits, lower, upper)
return bestsearch
def generate_single_byte_index(opts, crate, name):
data = [None] * 128
invdata = {}
comments = []
for key, value in read_index(opts, crate, name, comments):
assert 0 <= key < 128 and 0 <= value < 0xffff and data[key] is None and value not in invdata
data[key] = value
invdata[value] = key
# generate a trie with a minimal amount of data
triebits, trielower, trieupper = make_minimal_trie(invdata, lowerlimit=0x10000)
# generate a bitmap for quickly rejecting invalid chars even in the unoptimized setting
bitlen = 0
while 2**bitlen <= max(invdata):
bitlen += 1
bitmapshift = bitlen - 5
bitmap = 0
for value in invdata:
bitmap |= 1 << (value >> bitmapshift)
assert 2**16 <= bitmap < 2**32
args = dict(
datasz=len(data),
maxvalue=max(invdata),
bitmap=bitmap,
bitmapshift=bitmapshift,
triebits=triebits,
triemask=(1 << triebits) - 1,
trielowersz=len(trielower),
trieuppersz=len(trieupper),
)
with mkdir_and_open(crate, name) as f:
write_header(f, name, comments)
write_fmt(f, args, '''\
|
|#[allow(dead_code)] const X: u16 = 0xffff;
|
|const FORWARD_TABLE: &[u16] = &[
''')
write_comma_separated(f, ' ',
['%s, ' % ('X' if value is None else value) for value in data])
write_fmt(f, args, '''\
|]; // {datasz} entries
|
|/// Returns the index code point for pointer `code` in this index.
|#[inline]
|pub fn forward(code: u8) -> u16 {{
| FORWARD_TABLE[(code - 0x80) as usize]
|}}
|
|#[cfg(not(feature = "no-optimized-legacy-encoding"))]
|const BACKWARD_TABLE_LOWER: &[u8] = &[
''')
write_comma_separated(f, ' ',
['%d, ' % (0 if v is None else v + 0x80) for v in trielower])
write_fmt(f, args, '''\
|]; // {trielowersz} entries
|
|#[cfg(not(feature = "no-optimized-legacy-encoding"))]
|const BACKWARD_TABLE_UPPER: &[u16] = &[
''')
write_comma_separated(f, ' ', ['%d, ' % v for v in trieupper])
write_fmt(f, args, '''\
|]; // {trieuppersz} entries
|
|/// Returns the index pointer for code point `code` in this index.
|#[inline]
|#[cfg(not(feature = "no-optimized-legacy-encoding"))]
|pub fn backward(code: u32) -> u8 {{
| let offset = (code >> {triebits}) as usize;
| let offset = if offset < {trieuppersz} {{BACKWARD_TABLE_UPPER[offset] as usize}} else {{0}};
| BACKWARD_TABLE_LOWER[offset + ((code & {triemask}) as usize)]
|}}
|
|/// Returns the index pointer for code point `code` in this index.
|#[cfg(feature = "no-optimized-legacy-encoding")]
|pub fn backward(code: u32) -> u8 {{
| if code > {maxvalue} || (({bitmap:#x}u32 >> (code >> {bitmapshift})) & 1) == 0 {{ return 0; }}
| let code = code as u16;
| for i in 0..0x80 {{
| if FORWARD_TABLE[i as usize] == code {{ return 0x80 + i; }}
| }}
| 0
|}}
|
|#[cfg(test)]
|encoding_index_tests::single_byte_tests! {{
|}}
''')
forwardsz = 2 * len(data)
backwardsz = len(trielower) + 2 * len(trieupper)
return forwardsz, backwardsz, 0
def generate_multi_byte_index(opts, crate, name):
# some indices need an additional function for efficient mapping.
def premap(i): return i
premapcode = ''
if not opts.no_premapping:
if name == 'euc-kr':
def premap(i):
r, c = divmod(i, 190)
if c >= 96:
if r < 44:
pass
elif r < 47:
return None
elif r < 72:
r -= 3
elif r < 73:
return None
else:
r -= 4
return r * (190 - 96) + (c - 96)
else:
if c < 26:
pass
elif c < 32:
return None
elif c < 58:
c -= 6
elif c < 64:
return None
else:
c -= 12
return (125 - 4) * (190 - 96) + r * (96 - 12) + c
premapcode = dedent('''\
|
|fn premap_forward(code: u16) -> u16 {
| let r = code / 190;
| let c = code % 190;
| if c >= 96 {
| let dr = match r {
| 0..=43 => 0,
| 44..=46 => return X,
| 47..=71 => 3,
| 72 => return X,
| 73..=124 => 4,
| _ => return X,
| };
| (r - dr) * (190 - 96) + (c - 96)
| } else {
| let dc = match c {
| 0..=25 => 0,
| 26..=31 => return X,
| 32..=57 => 6,
| 58..=63 => return X,
| _ => 12,
| };
| (125 - 4) * (190 - 96) + r * (96 - 12) + (c - dc)
| }
|}
|
|#[cfg(feature = "no-optimized-legacy-encoding")]
|fn premap_backward(code: u16) -> u16 {
| if code < (125 - 4) * (190 - 96) {
| let r = code / (190 - 96);
| let c = code % (190 - 96);
| let dr = match r {
| 0..=43 => 0,
| 44..=68 => 3,
| _ => 4,
| };
| (r + dr) * 190 + (c + 96)
| } else if code < X {
| let code = code - (125 - 4) * (190 - 96);
| let r = code / (96 - 12);
| let c = code % (96 - 12);
| let dc = match c {
| 0..=25 => 0,
| 26..=51 => 6,
| _ => 12,
| };
| r * 190 + (c + dc)
| } else {
| X
| }
|}
''')
elif name == 'jis0208':
def premap(i):
if i < 690:
pass
elif i < 1128:
return None
elif i < 1220:
i -= 438
elif i < 1410:
return None
elif i < 7808:
i -= 628
elif i < 8272:
return None
elif i < 8648:
i -= 1092
elif i < 10716:
return None
else:
i -= 3160
return i
premapcode = dedent('''\
|
|fn premap_forward(code: u16) -> u16 {
| match code {
| 0..=689 => code,
| 690..=1127 => X,
| 1128..=1219 => code - 438,
| 1220..=1409 => X,
| 1410..=7807 => code - 628,
| 7808..=8271 => X,
| 8272..=8647 => code - 1092,
| 8648..=10715 => X,
| _ => code - 3160,
| }
|}
|
|#[cfg(feature = "no-optimized-legacy-encoding")]
|fn premap_backward(code: u16) -> u16 {
| match code {
| 0..=689 => code,
| 690..=781 => code + 438,
| 782..=7179 => code + 628,
| 7180..=7555 => code + 1092,
| _ => code.saturating_add(3160),
| }
|}
''')
elif name == 'jis0212':
def premap(i):
if i < 175:
pass
elif i < 534:
return None
elif i < 1027:
i -= 359
elif i < 1410:
return None
else:
i -= 742
return i
premapcode = dedent('''\
|
|fn premap_forward(code: u16) -> u16 {
| match code {
| 0..=174 => code,
| 175..=533 => X,
| 534..=1026 => code - 359,
| 1027..=1409 => X,
| _ => code - 742,
| }
|}
|
|#[cfg(feature = "no-optimized-legacy-encoding")]
|fn premap_backward(code: u16) -> u16 {
| match code {
| 0..=174 => code,
| 175..=667 => code + 359,
| _ => code.saturating_add(742),
| }
|}
''')
data = {} # key => value
invdata = {} # (the first) value => key, with some exceptions
dups = [] # any value that is not mapped in invdata
rawdups = [] # same to dups but a literal Rust code
comments = [] # the comments in the index file
morebits = False # True if the mapping needs SIP
for key, value in read_index(opts, crate, name, comments):
assert 0 <= key < 0xffff and 0 <= value < 0x110000 and value != 0xffff and key not in data
if value >= 0x10000:
assert (value >> 16) == 2
morebits = True
data[key] = value
if value not in invdata:
invdata[value] = key
else:
dups.append(key)
if name == 'big5':
# Big5 has four two-letter forward mappings, we use special entries for them
specialidx = [1133, 1135, 1164, 1166]
assert all(key not in data for key in specialidx)
assert all(value not in invdata for value in range(len(specialidx)))
for value, key in enumerate(specialidx):
data[key] = value
dups.append(key) # no consistency testing for them
# and HKSCS additions are entirely missing from the backward mapping
hkscslimit = (0xa1 - 0x81) * 157
for value, key in list(invdata.items()):
if key < hkscslimit:
del invdata[value]
rawdups.append('0..=%d' % (hkscslimit - 1)) # no consistency testing for them
# there are also some duplicate entries where the *later* mapping is canonical
swappedcanon = [0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345]
olddups = dups
dups = []
for key in olddups:
value = data[key]
if value in swappedcanon:
dups.append(invdata[value])
invdata[value] = key
else:
dups.append(key)
dups = [i for i in dups if i >= hkscslimit] # cleanup
# JIS X 0208 index has two ranges [8272,8836) and [8836,11280) to support two slightly
# different encodings EUC-JP and Shift_JIS; the default backward function would favor
# the former, so we need a separate mapping for the latter.
#
# fortunately for us, all allocated codes in [8272,8836) have counterparts in others,
# so we only need a smaller remapping from [8272,8836) to other counterparts.
remap = None
if name == 'jis0208':
REMAP_MIN = 8272
REMAP_MAX = 8835
invdataminusremap = {}
for key, value in list(data.items()):
if value not in invdataminusremap and not REMAP_MIN <= key <= REMAP_MAX:
invdataminusremap[value] = key
remap = []
for i in range(REMAP_MIN, REMAP_MAX + 1):
if i in data:
assert data[i] in invdataminusremap
value = invdataminusremap[data[i]]
assert value < 0x10000
remap.append(value)
else:
remap.append(0xffff)
newdata = {}
for key, value in list(data.items()):
key = premap(key)
assert key is not None and 0 <= key < 0x10000 and key not in newdata
newdata[key] = value
data = newdata
# generate a trie and search index with a minimal amount of data
triebits, trielower, trieupper = make_minimal_trie(invdata, lowerlimit=0x10000)
searchbits, searchlower, searchupper = make_minimal_search(
data, invdata, premap, maxsearch=opts.max_backward_search_multibyte)
# if the search degenerated to the full linear search, use a special code for them
fulllinearsearch = (searchupper == [0, 1])
minkey = min(data)
maxkey = max(data) + 1
args = dict(
premapcode=premapcode,
maxvalue=max(invdata),
dataoff=minkey,
datasz=maxkey - minkey,
triebits=triebits,
triemask=(1 << triebits) - 1,
trielowersz=len(trielower),
trieuppersz=len(trieupper),
fulllinearsearch=fulllinearsearch,
searchbits=searchbits,
searchmask=(1 << searchbits) - 1,
searchlowersz=len(searchlower),
searchuppersz=len(searchupper),
searchupperszm1=len(searchupper) - 1,
)
if remap:
args.update(
remapsz=len(remap),
remapmin=REMAP_MIN,
remapmax=REMAP_MAX,
)
with mkdir_and_open(crate, name) as f:
write_header(f, name, comments)
write_fmt(f, args, '''\
|
|#[allow(dead_code)] const X: u16 = 0xffff;
|{premapcode}
|const FORWARD_TABLE: &[u16] = &[
''')
write_comma_separated(f, ' ',
['%s, ' % (data[key] & 0xffff if key in data else 'X')
for key in range(minkey, maxkey)])
write_fmt(f, args, '''\
|]; // {datasz} entries
''')
if morebits:
bits = []
for i in range(minkey, maxkey, 32):
v = 0
for j in range(32):
v |= (data.get(i + j, 0) >= 0x10000) << j
bits.append(v)
write_fmt(f, args, '''\
|
|const FORWARD_TABLE_MORE: &[u32] = &[
''')
write_comma_separated(f, ' ', ['%d, ' % v for v in bits])
write_fmt(f, args, '''\
|]; // {moresz} entries
''',
moresz=len(bits))
write_fmt(f, args, '''\
|
|/// Returns the index code point for pointer `code` in this index.
|#[inline]
|pub fn forward(code: u16) -> u32 {{
''')
write_fmt(f, args, premapcode, '''\
| let code = premap_forward(code);
''')
write_fmt(f, args, minkey != 0, '''\
| let code = (code as usize).wrapping_sub({dataoff});
''', '''\
| let code = code as usize;
''')
write_fmt(f, args, '''\
| if code < {datasz} {{
''')
write_fmt(f, args, morebits, '''\
| (FORWARD_TABLE[code] as u32) | (((FORWARD_TABLE_MORE[code >> 5] >> (code & 31)) & 1) << 17)
''', '''\
| FORWARD_TABLE[code] as u32
''')
write_fmt(f, args, '''\
| }} else {{
| X as u32
| }}
|}}
|
|#[cfg(not(feature = "no-optimized-legacy-encoding"))]
|const BACKWARD_TABLE_LOWER: &[u16] = &[
''')
write_comma_separated(f, ' ',
['%s, ' % ('X' if v is None else v) for v in trielower])
write_fmt(f, args, '''\
|]; // {trielowersz} entries
|
|#[cfg(not(feature = "no-optimized-legacy-encoding"))]
|const BACKWARD_TABLE_UPPER: &[u16] = &[
''')
write_comma_separated(f, ' ', ['%d, ' % v for v in trieupper])
write_fmt(f, args, '''\
|]; // {trieuppersz} entries
''')
if not fulllinearsearch:
write_fmt(f, args, '''\
|
|#[cfg(feature = "no-optimized-legacy-encoding")]
|const BACKWARD_SEARCH_LOWER: &[(u16, u16)] = &[
''')
write_comma_separated(f, ' ',
['(%d, %d), ' % (lo, hi) for lo, hi in searchlower])
write_fmt(f, args, '''\
|]; // {searchlowersz} entries
|
|#[cfg(feature = "no-optimized-legacy-encoding")]
|const BACKWARD_SEARCH_UPPER: &[u16] = &[
''')
write_comma_separated(f, ' ', ['%d, ' % v for v in searchupper])
write_fmt(f, args, '''\
|]; // {searchuppersz} entries
''')
if remap:
write_fmt(f, args, '''\
|
|const BACKWARD_TABLE_REMAPPED: &[u16] = &[
''')
write_comma_separated(f, ' ', ['%d, ' % v for v in remap])
write_fmt(f, args, '''\
|]; // {remapsz} entries
''')
write_fmt(f, args, '''\
|
|/// Returns the index pointer for code point `code` in this index.
|#[inline]
|#[cfg(not(feature = "no-optimized-legacy-encoding"))]
|pub fn backward(code: u32) -> u16 {{
| let offset = (code >> {triebits}) as usize;
| let offset = if offset < {trieuppersz} {{BACKWARD_TABLE_UPPER[offset] as usize}} else {{0}};
| // BACKWARD_TABLE_LOWER stores the actual (pre-mapped) value
| // so we don't have to call premap_backward here.
| BACKWARD_TABLE_LOWER[offset + ((code & {triemask}) as usize)]
|}}
|
|/// Returns the index pointer for code point `code` in this index.
|#[cfg(feature = "no-optimized-legacy-encoding")]
|pub fn backward(code: u32) -> u16 {{
| // avoid mistaking a placeholder for the actual value
| if code == X as u32 {{ return 0xffff; }}
| let codelo = (code & 0xffff) as u16;
''')
retexpr = ('premap_backward(%s)' if premapcode else '%s') % (
'(%s) + {dataoff}' if minkey != 0 else '%s')
if morebits:
write_fmt(f, args, '''\
| let codehi = code >> 16;
| #[inline] fn verify_and_map(codehi: u32, i: u16) -> Option {{
| let hi = ((FORWARD_TABLE_MORE[i as usize >> 5] >> (i & 31)) & 1) << 1;
| if hi != codehi {{ return None; }}
| Some(''' + (retexpr % 'i') + ''')
| }}
''')
retifcorrect = 'if let Some(i_) = verify_and_map(codehi, %s) {{ return i_; }}'
else:
retifcorrect = 'return %s;' % (retexpr % '%s')
write_fmt(f, args, not fulllinearsearch, '''\
| let offset = (code >> {searchbits}) as usize;
| let (start, end) = if offset < {searchupperszm1} {{
| (BACKWARD_SEARCH_UPPER[offset], BACKWARD_SEARCH_UPPER[offset+1])
| }} else {{
| (0, 0)
| }};
| for &(s, e) in &BACKWARD_SEARCH_LOWER[(start as usize)..(end as usize)] {{
| if s >= 0x8000 {{
| if e == codelo {{
| ''' + (retifcorrect % 's & 0x7fff') + '''
| }}
| }} else {{
| for i in s..e {{
| if FORWARD_TABLE[i as usize] == codelo {{
| ''' + (retifcorrect % 'i') + '''
| }}
| }}
| }}
| }}
''', '''\
| if code <= {maxvalue} {{
| for (i, &v) in FORWARD_TABLE.iter().enumerate() {{
| if v == codelo {{
| ''' + (retifcorrect % 'i as u16') + '''
| }}
| }}
| }}
''')
write_fmt(f, args, '''\
| X
|}}
''')
write_fmt(f, args, name == 'jis0208', '''\
|
|/// Returns the index shift_jis pointer for code point `code`.
|#[inline]
|pub fn backward_remapped(code: u32) -> u16 {{
| let value = backward(code);
| if ({remapmin}..={remapmax}).contains(&value) {{
| BACKWARD_TABLE_REMAPPED[(value - {remapmin}) as usize]
| }} else {{
| value
| }}
|}}
''')
write_fmt(f, args, '''\
|
|#[cfg(test)]
|encoding_index_tests::multi_byte_tests! {{
''')
write_fmt(f, args, remap, '''\
| remap = [{remapmin}, {remapmax}],
''')
if dups or rawdups:
write_fmt(f, args, '''\
| dups = [
''')
write_comma_separated(f, ' ', ['%s, ' %
v for v in rawdups] + ['%d, ' %
v for v in sorted(dups)])
write_fmt(f, args, '''\
| ]
''')
else:
write_fmt(f, args, '''\
| dups = []
''')
write_fmt(f, args, '''\
|}}
''')
forwardsz = 2 * (maxkey - minkey)
backwardsz = 2 * len(trielower) + 2 * len(trieupper)
backwardszslow = 2 * len(searchlower) + 4 * len(searchupper)
backwardmore = 0
if morebits:
backwardmore += 4 * ((maxkey - minkey + 31) // 32)
if remap:
backwardmore += 2 * len(remap)
return forwardsz, backwardsz + backwardmore, backwardszslow + backwardmore
def generate_multi_byte_range_lbound_index(opts, crate, name):
data = []
comments = []
for key, value in read_index(opts, crate, name, comments):
data.append((key, value))
assert data and data == sorted(data)
minkey, minvalue = data[0]
maxkey, maxvalue = data[-1]
if data[0] != (0, 0):
data.insert(0, (0, 0))
maxlog2 = 0
while 2**(maxlog2 + 1) <= len(data):
maxlog2 += 1
if name == 'gb18030-ranges':
keyubound = 0x110000
valueubound = 126 * 10 * 126 * 10
else:
keyubound = maxkey + 1
valueubound = maxvalue + 1
args = dict(
datasz=len(data),
minkey=minkey,
maxkey=maxkey,
keyubound=keyubound,
minvalue=minvalue,
maxvalue=maxvalue,
valueubound=valueubound,
)
with mkdir_and_open(crate, name) as f:
write_header(f, name, comments)
write_fmt(f, args, '''\
|
|const FORWARD_TABLE: &[u32] = &[
''')
write_comma_separated(f, ' ', ['%d, ' % value for key, value in data])
write_fmt(f, args, '''\
|]; // {datasz} entries
|
|const BACKWARD_TABLE: &[u32] = &[
''')
write_comma_separated(f, ' ', ['%d, ' % key for key, value in data])
write_fmt(f, args, '''\
|]; // {datasz} entries
|
|fn search(code: u32, fromtab: &'static [u32], totab: &'static [u32]) -> u32 {{
| let mut i = if code >= fromtab[{firstoff}] {{{firstdelta}}} else {{0}};
''',
firstoff=2**maxlog2 - 1,
firstdelta=len(data) - 2**maxlog2 + 1)
for i in range(maxlog2 - 1, -1, -1):
write_fmt(f, args, '''\
| if code >= fromtab[i{plusoff}] {{ i += {delta}; }}
''',
plusoff='+%d' % (2**i - 1) if i > 0 else '',
delta=2**i)
write_fmt(f, args, '''\
| (code - fromtab[i-1]) + totab[i-1]
|}}
|
|/// Returns the index code point for pointer `code` in this index.
|#[inline]
|pub fn forward(code: u32) -> u32 {{
''')
write_fmt(f, args, minkey > 0, '''\
| if code < {minkey} {{ return 0xffffffff; }}
''')
# GB 18030 has "invalid" region inside and a singular mapping
write_fmt(f, args, name == 'gb18030-ranges', '''\
| if (code > 39419 && code < 189000) || code > 1237575 {{ return 0xffffffff; }}
| if code == 7457 {{ return 0xe7c7; }}
''')
write_fmt(f, args, '''\
| search(code, BACKWARD_TABLE, FORWARD_TABLE)
|}}
|
|/// Returns the index pointer for code point `code` in this index.
|#[inline]
|pub fn backward(code: u32) -> u32 {{
''')
write_fmt(f, args, minvalue > 0, '''\
| if code < {minvalue} {{ return 0xffffffff; }}
''')
# GB 18030 has a singular mapping
write_fmt(f, args, name == 'gb18030-ranges', '''\
| if code == 0xe7c7 {{ return 7457; }}
''')
write_fmt(f, args, '''\
| search(code, FORWARD_TABLE, BACKWARD_TABLE)
|}}
|
|#[cfg(test)]
|encoding_index_tests::multi_byte_range_tests! {{
| key = [{minkey}, {maxkey}], key < {keyubound},
| value = [{minvalue}, {maxvalue}], value < {valueubound}
|}}
''')
forwardsz = 4 * len(data)
backwardsz = 4 * len(data)
return forwardsz, backwardsz, backwardsz
INDICES = [
('encoding-index-singlebyte/armscii-8', generate_single_byte_index),
('encoding-index-singlebyte/cp437', generate_single_byte_index),
('encoding-index-singlebyte/ibm866', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-2', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-3', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-4', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-5', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-6', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-7', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-8', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-10', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-13', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-14', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-15', generate_single_byte_index),
('encoding-index-singlebyte/iso-8859-16', generate_single_byte_index),
('encoding-index-singlebyte/koi8-r', generate_single_byte_index),
('encoding-index-singlebyte/koi8-u', generate_single_byte_index),
('encoding-index-singlebyte/macintosh', generate_single_byte_index),
('encoding-index-singlebyte/windows-874', generate_single_byte_index),
('encoding-index-singlebyte/windows-1250', generate_single_byte_index),
('encoding-index-singlebyte/windows-1251', generate_single_byte_index),
('encoding-index-singlebyte/windows-1252', generate_single_byte_index),
('encoding-index-singlebyte/windows-1253', generate_single_byte_index),
('encoding-index-singlebyte/windows-1254', generate_single_byte_index),
('encoding-index-singlebyte/windows-1255', generate_single_byte_index),
('encoding-index-singlebyte/windows-1256', generate_single_byte_index),
('encoding-index-singlebyte/windows-1257', generate_single_byte_index),
('encoding-index-singlebyte/windows-1258', generate_single_byte_index),
('encoding-index-singlebyte/x-mac-cyrillic', generate_single_byte_index),
('encoding-index-tradchinese/big5', generate_multi_byte_index),
('encoding-index-korean/euc-kr', generate_multi_byte_index),
('encoding-index-simpchinese/gb18030', generate_multi_byte_index),
('encoding-index-japanese/jis0208', generate_multi_byte_index),
('encoding-index-japanese/jis0212', generate_multi_byte_index),
('encoding-index-simpchinese/gb18030-ranges', generate_multi_byte_range_lbound_index),
]
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'-f', '--flush-cache', action='store_true',
help='flush the download cache'
)
parser.add_argument(
'--cache-dir',
default=Path(__file__).parent.resolve() / '.cache',
help='set the download cache directory [default: %(default)s]',
type=Path
)
parser.add_argument(
'--singlebyte', dest='func_filter', action='store_const',
const=generate_single_byte_index,
help='generate only single-byte indices'
)
parser.add_argument(
'--multibyte', dest='func_filter', action='store_const',
const=generate_multi_byte_index,
help='generate only multi-byte indices'
)
parser.add_argument(
'--max-backward-search-multibyte', type=lambda v: int(v, 0),
metavar='MAX_SEARCH', default='0x200',
help='set the max search limit of the unoptimized backward mapping '
'for multi-byte indices [default: %(default)s]'
)
parser.add_argument(
'--no-premapping', action='store_true',
help='disable premapping; trades table size for decoder performance'
)
parser.add_argument(
'filters', nargs='*', help='substring of indices to regenerate'
)
opts = parser.parse_args()
totalsz = totalszslow = 0
for index, generate in INDICES:
crate, __, index = index.partition('/')
if opts.filters and all(s not in index for s in opts.filters):
continue
if opts.func_filter and generate is not opts.func_filter:
continue
print('generating index %s...' % index, end='', file=sys.stderr)
write_license_file(crate)
forwardsz, backwardsz, backwardszslow = generate(opts, crate, index)
totalsz += forwardsz + backwardsz
totalszslow += forwardsz + backwardszslow
print('%d + %d (%d) = %d (%d) bytes.' %
(forwardsz, backwardsz, backwardszslow,
forwardsz + backwardsz, forwardsz + backwardszslow), file=sys.stderr)
print('total %d (%d) bytes.' % (totalsz, totalszslow), file=sys.stderr)
if __name__ == '__main__':
main()