#!/usr/bin/env bash
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Generates the ICU .ucm mapping table for EUC-JP as defined by the Encoding
# Standard and writes it to stdout.
#
# References:
#   https://encoding.spec.whatwg.org/#euc-jp
#   https://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
#   https://www.iana.org/assignments/charset-reg/CP51932
#   Table 3-64 in CJKV Information Processing 2/e.
#
# Usage: run this script in the source/data/mappings directory and save its
# output to euc-jp-html5.ucm. It reads the following two index files from the
# current directory:
#   https://encoding.spec.whatwg.org/index-jis0208.txt
#   https://encoding.spec.whatwg.org/index-jis0212.txt
#
# Every mapping line has the UCM form "<UXXXX> \xNN... |F", where the
# precision flag F is 0 for a round-trip mapping, 1 for an encode-only
# fallback and 3 for a decode-only mapping.

#######################################
# Prints the UCM header (converter name, byte-length limits, substitution
# character and the MBCS state table) followed by the CHARMAP opener.
# Outputs: header lines on stdout.
#######################################
preamble() {
  # Quoted delimiter: the body is emitted verbatim, including "\x3F".
  cat <<'PREAMBLE'
<code_set_name>               "euc-jp-html"
<char_name_mask>              "AXXXX"
<mb_cur_max>                  3
<mb_cur_min>                  1
<uconv_class>                 "MBCS"
<subchar>                     \x3F
<icu:charsetFamily>           "ASCII"

<icu:state>                   0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state>                   a1-fe
<icu:state>                   a1-df
<icu:state>                   a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state>                   a1-fe.u

CHARMAP
PREAMBLE
}

#######################################
# Prints the identity round-trip mappings for the single-byte range
# 0x00-0x7F, e.g. "<U0000> \x00 |0".
# Outputs: 128 mapping lines on stdout.
#######################################
ascii() {
  local byte
  for ((byte = 0; byte <= 0x7F; byte++)); do
    printf '<U%04X> \\x%02X |0\n' "${byte}" "${byte}"
  done
}

#######################################
# Prints the half-width katakana mappings: 0x8E 0x[A1-DF] <-> U+FF61..U+FF9F.
# Outputs: 63 mapping lines on stdout.
#######################################
half_width_kana() {
  # Constant distance between the trail byte and its code point.
  local -r offset=$((0xFF61 - 0xA1))
  local trail
  for ((trail = 0xA1; trail <= 0xDF; trail++)); do
    printf '<U%04X> \\x8E\\x%02X |0\n' "$((trail + offset))" "${trail}"
  done
}

#######################################
# Prints the two-byte JIS X 0208 mappings read from index-jis0208.txt in the
# current directory.
#
# index-jis0208.txt has index pointers beyond the encoding space available in
# the 2-byte graphic plane of an ISO-2022-based encoding (94 x 94 = 8836
# cells, i.e. pointers 0..8835). Those exist for Shift_JIS only and are
# skipped here.
#
# The index also has 10 pairs of duplicate mapping entries. The
# bi-directional entry for a code point always comes *before* its
# uni-directional (EUC-JP to Unicode) duplicates, so a code point that was
# already seen earlier in the file is flagged '|3'. This matches the Encoding
# spec's definition of 'index pointer': the first entry in the file for a
# given code point.
#
# Outputs: one mapping line per accepted index entry on stdout.
#######################################
jis208() {
  # 161 == 0xA1; written in decimal because hexadecimal constants in awk
  # program text are not portable across awk implementations.
  # NF >= 2 skips blank and whitespace-only lines.
  awk '
    !/^#/ && NF >= 2 && $1 < 8836 {
      printf("<U%s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161,
             ($2 in seen) ? 3 : 0)
      seen[$2] = 1
    }
  ' index-jis0208.txt
}
#######################################
# Prints the three-byte (0x8F-prefixed) JIS X 0212 mappings read from
# index-jis0212.txt in the current directory.
# JIS X 0212 is for decoding only, so every line carries the '|3' flag.
# Outputs: one mapping line per index entry on stdout.
#######################################
jis212() {
  # 161 == 0xA1; written in decimal because hexadecimal constants in awk
  # program text are not portable across awk implementations.
  # NF >= 2 skips blank and whitespace-only lines.
  awk '
    !/^#/ && NF >= 2 {
      printf("<U%s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161)
    }
  ' index-jis0212.txt
}

#######################################
# Prints every mapping line of the table, unsorted: ASCII, half-width
# katakana, JIS X 0208, JIS X 0212 and three encode-only ('|1') fallbacks.
# Outputs: mapping lines on stdout.
#######################################
unsorted_table() {
  ascii
  half_width_kana
  jis208
  jis212
  # Encoder-only special cases of the Encoding spec's EUC-JP encoder:
  # U+00A5 -> 0x5C, U+203E -> 0x7E, U+2212 -> 0xA1 0xDD.
  # printf '%s\n' keeps the backslashes literal in every shell.
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
  printf '%s\n' '<U2212> \xA1\xDD |1'
}

# Refresh the two index files. A failed download is not fatal as long as a
# usable local copy exists, but a missing or empty index would silently
# produce a truncated table, so stop in that case.
for index_file in index-jis0208.txt index-jis0212.txt; do
  wget -N -r -nd "https://encoding.spec.whatwg.org/${index_file}" ||
    printf 'warning: could not download %s; trying the local copy\n' \
      "${index_file}" >&2
  if [ ! -s "${index_file}" ]; then
    printf 'error: %s is missing or empty\n' "${index_file}" >&2
    exit 1
  fi
done

preamble
# Byte-wise sort so the generated table does not depend on the user's locale.
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'