/*
 *  Various Unicode help functions for character classification predicates,
 *  case conversion, decoding, etc.
 */

#include "duk_internal.h"

/*
 *  Fast path tables
 */

#if defined(DUK_USE_IDCHAR_FASTPATH)
/* Identifier classification for ASCII codepoints, indexed by codepoint:
 *
 *    0: not IdentifierStart or IdentifierPart
 *    1: IdentifierStart and IdentifierPart
 *   -1: IdentifierPart only
 */
DUK_INTERNAL const duk_int8_t duk_is_idchar_tab[128] = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 0x00...0x0f */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 0x10...0x1f */
    0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  /* 0x20...0x2f */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  0,  0,  0,  0,  0,  /* 0x30...0x3f */
    0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  /* 0x40...0x4f */
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  1,  /* 0x50...0x5f */
    0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  /* 0x60...0x6f */
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0   /* 0x70...0x7f */
};
#endif

/*
 *  XUTF-8 and CESU-8 encoding/decoding
 */

/* Return the number of bytes (1...7) that duk_unicode_encode_xutf8()
 * emits for codepoint 'cp'.
 */
DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
    duk_uint_fast32_t x = (duk_uint_fast32_t) cp;

    if (x < 0x80UL) {
        /* 7 bits */
        return 1;
    } else if (x < 0x800UL) {
        /* 11 bits */
        return 2;
    } else if (x < 0x10000UL) {
        /* 16 bits */
        return 3;
    } else if (x < 0x200000UL) {
        /* 21 bits */
        return 4;
    } else if (x < 0x4000000UL) {
        /* 26 bits */
        return 5;
    } else if (x < (duk_ucodepoint_t) 0x80000000UL) {
        /* 31 bits */
        return 6;
    } else {
        /* 36 bits */
        return 7;
    }
}

#if defined(DUK_USE_ASSERTIONS)
/* Return the number of bytes (1, 2, 3, or 6) that duk_unicode_encode_cesu8()
 * emits for codepoint 'cp'.
 */
DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
    duk_uint_fast32_t x = (duk_uint_fast32_t) cp;

    if (x < 0x80UL) {
        /* 7 bits */
        return 1;
    } else if (x < 0x800UL) {
        /* 11 bits */
        return 2;
    } else if (x < 0x10000UL) {
        /* 16 bits */
        return 3;
    } else {
        /* Encoded as surrogate pair, each encoding to 3 bytes for
         * 6 bytes total.  Codepoints above U+10FFFF encode as 6 bytes
         * too, see duk_unicode_encode_cesu8().
         */
        return 3 + 3;
    }
}
#endif /* DUK_USE_ASSERTIONS */

/* Lead byte marker bits for extended UTF-8, indexed by (encoded length - 1). */
DUK_INTERNAL const duk_uint8_t duk_unicode_xutf8_markers[7] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };

/* Encode to extended UTF-8; 'out' must have space for at least
 * DUK_UNICODE_MAX_XUTF8_LENGTH bytes.  Allows encoding of any
 * 32-bit (unsigned) codepoint.  Returns the number of bytes written.
 */
DUK_INTERNAL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) {
    duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
    duk_small_int_t len;
    duk_uint8_t marker;
    duk_small_int_t i;

    len = duk_unicode_get_xutf8_length(cp);
    DUK_ASSERT(len > 0);

    marker = duk_unicode_xutf8_markers[len - 1]; /* 64-bit OK because always >= 0 */

    /* Fill continuation bytes from last to first, consuming 6 bits of
     * 'x' per byte; whatever remains goes into the lead byte.
     */
    i = len;
    DUK_ASSERT(i > 0);
    do {
        i--;
        if (i > 0) {
            out[i] = (duk_uint8_t) (0x80 + (x & 0x3f));
            x >>= 6;
        } else {
            /* Note: masking of 'x' is not necessary because of
             * range check and shifting -> no bits overlapping
             * the marker should be set.
             */
            out[0] = (duk_uint8_t) (marker + x);
        }
    } while (i > 0);

    return len;
}

/* Encode to CESU-8; 'out' must have space for at least
 * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
 * will encode to garbage but won't overwrite the output buffer.
 * Returns the number of bytes written (1, 2, 3, or 6).
 */
DUK_INTERNAL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) {
    duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
    duk_small_int_t len;

    if (x < 0x80UL) {
        out[0] = (duk_uint8_t) x;
        len = 1;
    } else if (x < 0x800UL) {
        out[0] = (duk_uint8_t) (0xc0 + ((x >> 6) & 0x1f));
        out[1] = (duk_uint8_t) (0x80 + (x & 0x3f));
        len = 2;
    } else if (x < 0x10000UL) {
        /* surrogate pairs get encoded here */
        out[0] = (duk_uint8_t) (0xe0 + ((x >> 12) & 0x0f));
        out[1] = (duk_uint8_t) (0x80 + ((x >> 6) & 0x3f));
        out[2] = (duk_uint8_t) (0x80 + (x & 0x3f));
        len = 3;
    } else {
        /*
         *  Unicode codepoints above U+FFFF are encoded as surrogate
         *  pairs here.  This ensures that all CESU-8 codepoints are
         *  16-bit values as expected in ECMAScript.  The surrogate
         *  pairs always get a 3-byte encoding (each) in CESU-8.
         *  See: http://en.wikipedia.org/wiki/Surrogate_pair
         *
         *  20-bit codepoint, 10 bits (A and B) per surrogate pair:
         *
         *    x = 0b00000000 0000AAAA AAAAAABB BBBBBBBB
         *  sp1 = 0b110110AA AAAAAAAA  (0xd800 + ((x >> 10) & 0x3ff))
         *  sp2 = 0b110111BB BBBBBBBB  (0xdc00 + (x & 0x3ff))
         *
         *  Encoded into CESU-8:
         *
         *  sp1 -> 0b11101101  (0xe0 + ((sp1 >> 12) & 0x0f))
         *      -> 0b1010AAAA  (0x80 + ((sp1 >> 6) & 0x3f))
         *      -> 0b10AAAAAA  (0x80 + (sp1 & 0x3f))
         *  sp2 -> 0b11101101  (0xe0 + ((sp2 >> 12) & 0x0f))
         *      -> 0b1011BBBB  (0x80 + ((sp2 >> 6) & 0x3f))
         *      -> 0b10BBBBBB  (0x80 + (sp2 & 0x3f))
         *
         *  Note that 0x10000 must be subtracted first.  The code below
         *  avoids the sp1, sp2 temporaries which saves around 20 bytes
         *  of code.
         */

        x -= 0x10000UL;

        out[0] = (duk_uint8_t) (0xed);
        out[1] = (duk_uint8_t) (0xa0 + ((x >> 16) & 0x0f));
        out[2] = (duk_uint8_t) (0x80 + ((x >> 10) & 0x3f));
        out[3] = (duk_uint8_t) (0xed);
        out[4] = (duk_uint8_t) (0xb0 + ((x >> 6) & 0x0f));
        out[5] = (duk_uint8_t) (0x80 + (x & 0x3f));
        len = 6;
    }

    return len;
}

/* Decode one extended UTF-8 codepoint starting at '*ptr', which must lie
 * within [ptr_start,ptr_end[.
 *
 * On success returns 1, stores the codepoint into '*out_cp' and advances
 * '*ptr' past the decoded bytes.  On error (pointer out of range, invalid
 * lead byte, or truncated sequence) returns 0 and leaves '*ptr' and
 * '*out_cp' untouched.
 *
 * Continuation bytes are not validated: the low 6 bits of each following
 * byte are used as is.
 */
DUK_INTERNAL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr,
                                                      const duk_uint8_t **ptr,
                                                      const duk_uint8_t *ptr_start,
                                                      const duk_uint8_t *ptr_end,
                                                      duk_ucodepoint_t *out_cp) {
    const duk_uint8_t *p;
    duk_uint32_t res;
    duk_uint_fast8_t ch;
    duk_small_int_t n; /* number of continuation bytes expected */

    DUK_UNREF(thr);

    p = *ptr;
    if (p < ptr_start || p >= ptr_end) {
        goto fail;
    }

    /*
     *  UTF-8 decoder which accepts longer than standard byte sequences.
     *  This allows full 32-bit code points to be used.
     */

    ch = (duk_uint_fast8_t) (*p++);
    if (ch < 0x80) {
        /* 0xxx xxxx   [7 bits] */
        res = (duk_uint32_t) (ch & 0x7f);
        n = 0;
    } else if (ch < 0xc0) {
        /* 10xx xxxx -> invalid */
        goto fail;
    } else if (ch < 0xe0) {
        /* 110x xxxx   10xx xxxx   [11 bits] */
        res = (duk_uint32_t) (ch & 0x1f);
        n = 1;
    } else if (ch < 0xf0) {
        /* 1110 xxxx   10xx xxxx   10xx xxxx   [16 bits] */
        res = (duk_uint32_t) (ch & 0x0f);
        n = 2;
    } else if (ch < 0xf8) {
        /* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx   [21 bits] */
        res = (duk_uint32_t) (ch & 0x07);
        n = 3;
    } else if (ch < 0xfc) {
        /* 1111 10xx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [26 bits] */
        res = (duk_uint32_t) (ch & 0x03);
        n = 4;
    } else if (ch < 0xfe) {
        /* 1111 110x   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [31 bits] */
        res = (duk_uint32_t) (ch & 0x01);
        n = 5;
    } else if (ch < 0xff) {
        /* 1111 1110   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [36 bits] */
        res = (duk_uint32_t) (0);
        n = 6;
    } else {
        /* 8-byte format could be:
         * 1111 1111   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [41 bits]
         *
         * However, this format would not have a zero bit following the
         * leading one bits and would not allow 0xFF to be used as an
         * "invalid xutf-8" marker for internal keys.  Further, 8-byte
         * encodings (up to 41 bit code points) are not currently needed.
         */
        goto fail;
    }

    DUK_ASSERT(p >= ptr_start); /* verified at beginning */
    DUK_ASSERT(p <= ptr_end);   /* lead byte was read from inside the range */

    /* Check that all continuation bytes are available.  Compare the
     * remaining byte count against 'n' instead of computing 'p + n',
     * which could point more than one past the end of the input and
     * would then be undefined behavior.
     */
    if ((duk_size_t) (ptr_end - p) < (duk_size_t) n) {
        goto fail;
    }

    while (n > 0) {
        DUK_ASSERT(p >= ptr_start && p < ptr_end);
        ch = (duk_uint_fast8_t) (*p++);
        /* No check that 'ch' is a continuation byte (10xx xxxx). */
        res = (res << 6) + (duk_uint32_t) (ch & 0x3f);
        n--;
    }

    *ptr = p;
    *out_cp = res;
    return 1;

fail:
    return 0;
}

/* used by e.g.
duk_regexp_executor.c, string built-ins */ DUK_INTERNAL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end) { duk_ucodepoint_t cp; if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) { return cp; } DUK_ERROR_INTERNAL(thr); DUK_WO_NORETURN(return 0;); } /* Compute (extended) utf-8 length without codepoint encoding validation, * used for string interning. * * NOTE: This algorithm is performance critical, more so than string hashing * in some cases. It is needed when interning a string and needs to scan * every byte of the string with no skipping. Having an ASCII fast path * is useful if possible in the algorithm. The current algorithms were * chosen from several variants, based on x64 gcc -O2 testing. See: * https://github.com/svaarala/duktape/pull/422 * * NOTE: must match tools/dukutil.py:duk_unicode_unvalidated_utf8_length(). */ #if defined(DUK_USE_PREFER_SIZE) /* Small variant; roughly 150 bytes smaller than the fast variant. */ DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) { const duk_uint8_t *p; const duk_uint8_t *p_end; duk_size_t ncont; duk_size_t clen; p = data; p_end = data + blen; ncont = 0; while (p != p_end) { duk_uint8_t x; x = *p++; if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) { ncont++; } } DUK_ASSERT(ncont <= blen); clen = blen - ncont; DUK_ASSERT(clen <= blen); return clen; } #else /* DUK_USE_PREFER_SIZE */ /* This seems like a good overall approach. Fast path for ASCII in 4 byte * blocks. 
*/ DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) { const duk_uint8_t *p; const duk_uint8_t *p_end; const duk_uint32_t *p32_end; const duk_uint32_t *p32; duk_size_t ncont; duk_size_t clen; ncont = 0; /* number of continuation (non-initial) bytes in [0x80,0xbf] */ p = data; p_end = data + blen; if (blen < 16) { goto skip_fastpath; } /* Align 'p' to 4; the input data may have arbitrary alignment. * End of string check not needed because blen >= 16. */ while (((duk_size_t) (const void *) p) & 0x03U) { duk_uint8_t x; x = *p++; if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) { ncont++; } } /* Full, aligned 4-byte reads. */ p32_end = (const duk_uint32_t *) (const void *) (p + ((duk_size_t) (p_end - p) & (duk_size_t) (~0x03))); p32 = (const duk_uint32_t *) (const void *) p; while (p32 != (const duk_uint32_t *) p32_end) { duk_uint32_t x; x = *p32++; if (DUK_LIKELY((x & 0x80808080UL) == 0)) { ; /* ASCII fast path */ } else { /* Flip highest bit of each byte which changes * the bit pattern 10xxxxxx into 00xxxxxx which * allows an easy bit mask test. */ x ^= 0x80808080UL; if (DUK_UNLIKELY(!(x & 0xc0000000UL))) { ncont++; } if (DUK_UNLIKELY(!(x & 0x00c00000UL))) { ncont++; } if (DUK_UNLIKELY(!(x & 0x0000c000UL))) { ncont++; } if (DUK_UNLIKELY(!(x & 0x000000c0UL))) { ncont++; } } } p = (const duk_uint8_t *) p32; /* Fall through to handle the rest. */ skip_fastpath: while (p != p_end) { duk_uint8_t x; x = *p++; if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) { ncont++; } } DUK_ASSERT(ncont <= blen); clen = blen - ncont; DUK_ASSERT(clen <= blen); return clen; } #endif /* DUK_USE_PREFER_SIZE */ /* * Unicode range matcher * * Matches a codepoint against a packed bitstream of character ranges. * Used for slow path Unicode matching. */ /* Must match tools/extract_chars.py, generate_match_table3(). 
*/ DUK_LOCAL duk_uint32_t duk__uni_decode_value(duk_bitdecoder_ctx *bd_ctx) { duk_uint32_t t; t = (duk_uint32_t) duk_bd_decode(bd_ctx, 4); if (t <= 0x0eU) { return t; } t = (duk_uint32_t) duk_bd_decode(bd_ctx, 8); if (t <= 0xfdU) { return t + 0x0f; } if (t == 0xfeU) { t = (duk_uint32_t) duk_bd_decode(bd_ctx, 12); return t + 0x0fU + 0xfeU; } else { t = (duk_uint32_t) duk_bd_decode(bd_ctx, 24); return t + 0x0fU + 0xfeU + 0x1000UL; } } DUK_LOCAL duk_small_int_t duk__uni_range_match(const duk_uint8_t *unitab, duk_size_t unilen, duk_codepoint_t cp) { duk_bitdecoder_ctx bd_ctx; duk_codepoint_t prev_re; duk_memzero(&bd_ctx, sizeof(bd_ctx)); bd_ctx.data = (const duk_uint8_t *) unitab; bd_ctx.length = (duk_size_t) unilen; prev_re = 0; for (;;) { duk_codepoint_t r1, r2; r1 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx); if (r1 == 0) { break; } r2 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx); r1 = prev_re + r1; r2 = r1 + r2; prev_re = r2; /* [r1,r2] is the range */ DUK_DDD(DUK_DDDPRINT("duk__uni_range_match: cp=%06lx range=[0x%06lx,0x%06lx]", (unsigned long) cp, (unsigned long) r1, (unsigned long) r2)); if (cp >= r1 && cp <= r2) { return 1; } } return 0; } /* * "WhiteSpace" production check. */ DUK_INTERNAL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) { /* * E5 Section 7.2 specifies six characters specifically as * white space: * * 0009;;Cc;0;S;;;;;N;CHARACTER TABULATION;;;; * 000B;;Cc;0;S;;;;;N;LINE TABULATION;;;; * 000C;;Cc;0;WS;;;;;N;FORM FEED (FF);;;; * 0020;SPACE;Zs;0;WS;;;;;N;;;;; * 00A0;NO-BREAK SPACE;Zs;0;CS; 0020;;;;N;NON-BREAKING SPACE;;;; * FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; * * It also specifies any Unicode category 'Zs' characters as white * space. These can be extracted with the "tools/extract_chars.py" script. 
* Current result: * * RAW OUTPUT: * =========== * 0020;SPACE;Zs;0;WS;;;;;N;;;;; * 00A0;NO-BREAK SPACE;Zs;0;CS; 0020;;;;N;NON-BREAKING SPACE;;;; * 1680;OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;; * 180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;; * 2000;EN QUAD;Zs;0;WS;2002;;;;N;;;;; * 2001;EM QUAD;Zs;0;WS;2003;;;;N;;;;; * 2002;EN SPACE;Zs;0;WS; 0020;;;;N;;;;; * 2003;EM SPACE;Zs;0;WS; 0020;;;;N;;;;; * 2004;THREE-PER-EM SPACE;Zs;0;WS; 0020;;;;N;;;;; * 2005;FOUR-PER-EM SPACE;Zs;0;WS; 0020;;;;N;;;;; * 2006;SIX-PER-EM SPACE;Zs;0;WS; 0020;;;;N;;;;; * 2007;FIGURE SPACE;Zs;0;WS; 0020;;;;N;;;;; * 2008;PUNCTUATION SPACE;Zs;0;WS; 0020;;;;N;;;;; * 2009;THIN SPACE;Zs;0;WS; 0020;;;;N;;;;; * 200A;HAIR SPACE;Zs;0;WS; 0020;;;;N;;;;; * 202F;NARROW NO-BREAK SPACE;Zs;0;CS; 0020;;;;N;;;;; * 205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS; 0020;;;;N;;;;; * 3000;IDEOGRAPHIC SPACE;Zs;0;WS; 0020;;;;N;;;;; * * RANGES: * ======= * 0x0020 * 0x00a0 * 0x1680 * 0x180e * 0x2000 ... 0x200a * 0x202f * 0x205f * 0x3000 * * A manual decoder (below) is probably most compact for this. */ duk_uint_fast8_t lo; duk_uint_fast32_t hi; /* cp == -1 (EOF) never matches and causes return value 0 */ lo = (duk_uint_fast8_t) (cp & 0xff); hi = (duk_uint_fast32_t) (cp >> 8); /* does not fit into an uchar */ if (hi == 0x0000UL) { if (lo == 0x09U || lo == 0x0bU || lo == 0x0cU || lo == 0x20U || lo == 0xa0U) { return 1; } } else if (hi == 0x0020UL) { if (lo <= 0x0aU || lo == 0x2fU || lo == 0x5fU) { return 1; } } else if (cp == 0x1680L || cp == 0x180eL || cp == 0x3000L || cp == 0xfeffL) { return 1; } return 0; } /* * "LineTerminator" production check. */ DUK_INTERNAL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) { /* * E5 Section 7.3 * * A LineTerminatorSequence essentially merges sequences * into a single line terminator. This must be handled by the caller. */ if (cp == 0x000aL || cp == 0x000dL || cp == 0x2028L || cp == 0x2029L) { return 1; } return 0; } /* * "IdentifierStart" production check. 
*/ DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) { /* * E5 Section 7.6: * * IdentifierStart: * UnicodeLetter * $ * _ * \ UnicodeEscapeSequence * * IdentifierStart production has one multi-character production: * * \ UnicodeEscapeSequence * * The '\' character is -not- matched by this function. Rather, the caller * should decode the escape and then call this function to check whether the * decoded character is acceptable (see discussion in E5 Section 7.6). * * The "UnicodeLetter" alternative of the production allows letters * from various Unicode categories. These can be extracted with the * "tools/extract_chars.py" script. * * Because the result has hundreds of Unicode codepoint ranges, matching * for any values >= 0x80 are done using a very slow range-by-range scan * and a packed range format. * * The ASCII portion (codepoints 0x00 ... 0x7f) is fast-pathed below because * it matters the most. The ASCII related ranges of IdentifierStart are: * * 0x0041 ... 0x005a ['A' ... 'Z'] * 0x0061 ... 0x007a ['a' ... 'z'] * 0x0024 ['$'] * 0x005f ['_'] */ /* ASCII (and EOF) fast path -- quick accept and reject */ if (cp <= 0x7fL) { #if defined(DUK_USE_IDCHAR_FASTPATH) return (cp >= 0) && (duk_is_idchar_tab[cp] > 0); #else if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') || cp == '_' || cp == '$') { return 1; } return 0; #endif } /* Non-ASCII slow path (range-by-range linear comparison), very slow */ #if defined(DUK_USE_SOURCE_NONBMP) if (duk__uni_range_match(duk_unicode_ids_noa, (duk_size_t) sizeof(duk_unicode_ids_noa), (duk_codepoint_t) cp)) { return 1; } return 0; #else if (cp < 0x10000L) { if (duk__uni_range_match(duk_unicode_ids_noabmp, sizeof(duk_unicode_ids_noabmp), (duk_codepoint_t) cp)) { return 1; } return 0; } else { /* without explicit non-BMP support, assume non-BMP characters * are always accepted as identifier characters. */ return 1; } #endif } /* * "IdentifierPart" production check. 
*/ DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) { /* * E5 Section 7.6: * * IdentifierPart: * IdentifierStart * UnicodeCombiningMark * UnicodeDigit * UnicodeConnectorPunctuation * [U+200C] * [U+200D] * * IdentifierPart production has one multi-character production * as part of its IdentifierStart alternative. The '\' character * of an escape sequence is not matched here, see discussion in * duk_unicode_is_identifier_start(). * * To match non-ASCII characters (codepoints >= 0x80), a very slow * linear range-by-range scan is used. The codepoint is first compared * to the IdentifierStart ranges, and if it doesn't match, then to a * set consisting of code points in IdentifierPart but not in * IdentifierStart. This is done to keep the unicode range data small, * at the expense of speed. * * The ASCII fast path consists of: * * 0x0030 ... 0x0039 ['0' ... '9', UnicodeDigit] * 0x0041 ... 0x005a ['A' ... 'Z', IdentifierStart] * 0x0061 ... 0x007a ['a' ... 'z', IdentifierStart] * 0x0024 ['$', IdentifierStart] * 0x005f ['_', IdentifierStart and * UnicodeConnectorPunctuation] * * UnicodeCombiningMark has no code points <= 0x7f. * * The matching code reuses the "identifier start" tables, and then * consults a separate range set for characters in "identifier part" * but not in "identifier start". These can be extracted with the * "tools/extract_chars.py" script. 
* * UnicodeCombiningMark -> categories Mn, Mc * UnicodeDigit -> categories Nd * UnicodeConnectorPunctuation -> categories Pc */ /* ASCII (and EOF) fast path -- quick accept and reject */ if (cp <= 0x7fL) { #if defined(DUK_USE_IDCHAR_FASTPATH) return (cp >= 0) && (duk_is_idchar_tab[cp] != 0); #else if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9') || cp == '_' || cp == '$') { return 1; } return 0; #endif } /* Non-ASCII slow path (range-by-range linear comparison), very slow */ #if defined(DUK_USE_SOURCE_NONBMP) if (duk__uni_range_match(duk_unicode_ids_noa, sizeof(duk_unicode_ids_noa), (duk_codepoint_t) cp) || duk__uni_range_match(duk_unicode_idp_m_ids_noa, sizeof(duk_unicode_idp_m_ids_noa), (duk_codepoint_t) cp)) { return 1; } return 0; #else if (cp < 0x10000L) { if (duk__uni_range_match(duk_unicode_ids_noabmp, sizeof(duk_unicode_ids_noabmp), (duk_codepoint_t) cp) || duk__uni_range_match(duk_unicode_idp_m_ids_noabmp, sizeof(duk_unicode_idp_m_ids_noabmp), (duk_codepoint_t) cp)) { return 1; } return 0; } else { /* without explicit non-BMP support, assume non-BMP characters * are always accepted as identifier characters. */ return 1; } #endif } /* * Unicode letter check. */ DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) { /* * Unicode letter is now taken to be the categories: * * Lu, Ll, Lt, Lm, Lo * * (Not sure if this is exactly correct.) * * The ASCII fast path consists of: * * 0x0041 ... 0x005a ['A' ... 'Z'] * 0x0061 ... 0x007a ['a' ... 
'z'] */ /* ASCII (and EOF) fast path -- quick accept and reject */ if (cp <= 0x7fL) { if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z')) { return 1; } return 0; } /* Non-ASCII slow path (range-by-range linear comparison), very slow */ #if defined(DUK_USE_SOURCE_NONBMP) if (duk__uni_range_match(duk_unicode_ids_noa, sizeof(duk_unicode_ids_noa), (duk_codepoint_t) cp) && !duk__uni_range_match(duk_unicode_ids_m_let_noa, sizeof(duk_unicode_ids_m_let_noa), (duk_codepoint_t) cp)) { return 1; } return 0; #else if (cp < 0x10000L) { if (duk__uni_range_match(duk_unicode_ids_noabmp, sizeof(duk_unicode_ids_noabmp), (duk_codepoint_t) cp) && !duk__uni_range_match(duk_unicode_ids_m_let_noabmp, sizeof(duk_unicode_ids_m_let_noabmp), (duk_codepoint_t) cp)) { return 1; } return 0; } else { /* without explicit non-BMP support, assume non-BMP characters * are always accepted as letters. */ return 1; } #endif } /* * Complex case conversion helper which decodes a bit-packed conversion * control stream generated by tools/extract_caseconv.py. The conversion * is very slow because it runs through the conversion data in a linear * fashion to save space (which is why ASCII characters have a special * fast path before arriving here). * * The particular bit counts etc have been determined experimentally to * be small but still sufficient, and must match the Python script * (tools/extract_caseconv.py). * * The return value is the case converted codepoint or -1 if the conversion * results in multiple characters (this is useful for regexp Canonicalization * operation). If 'buf' is not NULL, the result codepoint(s) are also * appended to the hbuffer. * * Context and locale specific rules must be checked before consulting * this function. 
*/

DUK_LOCAL
duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr, duk_bufwriter_ctx *bw, duk_codepoint_t cp, duk_bitdecoder_ctx *bd_ctx) {
    duk_small_int_t skip = 0; /* stride of the current range group; 1, 2, 3, ... */
    duk_small_int_t n;
    duk_small_int_t t;
    duk_small_int_t count;
    duk_codepoint_t tmp_cp;
    duk_codepoint_t start_i;
    duk_codepoint_t start_o;

    DUK_ASSERT(bd_ctx != NULL);
    DUK_UNREF(thr);

    DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp));

    /* range conversion with a "skip": groups of ranges, one group per
     * skip value; each group starts with a 6-bit range count and the
     * section ends when that count field reads 0x3f.
     */
    DUK_DDD(DUK_DDDPRINT("checking ranges"));
    for (;;) {
        skip++;
        n = (duk_small_int_t) duk_bd_decode(bd_ctx, 6);
        if (n == 0x3f) {
            /* end marker */
            break;
        }
        DUK_DDD(DUK_DDDPRINT("skip=%ld, n=%ld", (long) skip, (long) n));

        while (n--) {
            /* Range entry: 16-bit input start, 16-bit output start,
             * 7-bit element count.
             */
            start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
            start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
            count = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
            DUK_DDD(DUK_DDDPRINT("range: start_i=%ld, start_o=%ld, count=%ld, skip=%ld",
                                 (long) start_i,
                                 (long) start_o,
                                 (long) count,
                                 (long) skip));

            if (cp >= start_i) {
                tmp_cp = cp - start_i; /* always >= 0 */
                /* Match when the offset is inside the range and lands
                 * exactly on a multiple of 'skip'.
                 */
                if (tmp_cp < (duk_codepoint_t) count * (duk_codepoint_t) skip &&
                    (tmp_cp % (duk_codepoint_t) skip) == 0) {
                    DUK_DDD(DUK_DDDPRINT("range matches input codepoint"));
                    cp = start_o + tmp_cp;
                    goto single;
                }
            }
        }
    }

    /* 1:1 conversion: 7-bit entry count, then 16-bit input/output pairs */
    n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
    DUK_DDD(DUK_DDDPRINT("checking 1:1 conversions (count %ld)", (long) n));
    while (n--) {
        start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
        start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
        DUK_DDD(DUK_DDDPRINT("1:1 conversion %ld -> %ld", (long) start_i, (long) start_o));
        if (cp == start_i) {
            DUK_DDD(DUK_DDDPRINT("1:1 matches input codepoint"));
            cp = start_o;
            goto single;
        }
    }

    /* complex, multicharacter conversion: 7-bit entry count, then per
     * entry a 16-bit input codepoint, a 2-bit output length 't', and
     * 't' 16-bit output codepoints.
     */
    n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
    DUK_DDD(DUK_DDDPRINT("checking 1:n conversions (count %ld)", (long) n));
    while (n--) {
        start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
        t = (duk_small_int_t) duk_bd_decode(bd_ctx, 2);
        DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t));
        if (cp == start_i) {
            DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
            /* Output codepoints are only decoded when there is a
             * bufwriter to receive them; the caller gets -1 either way.
             */
            if (bw != NULL) {
                while (t--) {
                    tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
                    DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) tmp_cp);
                }
            }
            return -1;
        } else {
            /* No match: skip this entry's output codepoints so the
             * bitstream stays in sync for the next entry.
             */
            while (t--) {
                (void) duk_bd_decode(bd_ctx, 16);
            }
        }
    }

    /* default: no change */
    DUK_DDD(DUK_DDDPRINT("no rule matches, output is same as input"));
    /* fall through */

single:
    /* Single codepoint result in 'cp': emit it if output is requested. */
    if (bw != NULL) {
        DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
    }
    return cp;
}

/*
 *  Case conversion helper, with context/locale sensitivity.
 *  For proper case conversion, one needs to know the character
 *  and the preceding and following characters, as well as
 *  locale/language.
 *
 *  'prev' and 'next' are the neighboring codepoints of 'cp'; callers in
 *  this file pass -1 where no neighbor exists.  Returns the converted
 *  codepoint, or -1 when the conversion yields multiple codepoints.
 *  When 'bw' is not NULL the result is also written to the bufwriter.
 */

/* XXX: add 'language' argument when locale/language sensitive rule
 * support added.
 */
DUK_LOCAL
duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
                                           duk_bufwriter_ctx *bw,
                                           duk_codepoint_t cp,
                                           duk_codepoint_t prev,
                                           duk_codepoint_t next,
                                           duk_bool_t uppercase) {
    duk_bitdecoder_ctx bd_ctx;

    /* fast path for ASCII */
    if (cp < 0x80L) {
        /* XXX: there are language sensitive rules for the ASCII range.
         * If/when language/locale support is implemented, they need to
         * be implemented here for the fast path.  There are no context
         * sensitive rules for ASCII range.
         */

        if (uppercase) {
            if (cp >= 'a' && cp <= 'z') {
                cp = cp - 'a' + 'A';
            }
        } else {
            if (cp >= 'A' && cp <= 'Z') {
                cp = cp - 'A' + 'a';
            }
        }

        if (bw != NULL) {
            DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp);
        }
        return cp;
    }

    /* context and locale specific rules which cannot currently be represented
     * in the caseconv bitstream: hardcoded rules in C
     */
    if (uppercase) {
        /* XXX: turkish / azeri */
    } else {
        /*
         *  Final sigma context specific rule.  This is a rather tricky
         *  rule and this handling is probably not 100% correct now.
         *  The rule is not locale/language specific so it is supported.
         */

        if (cp == 0x03a3L && /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
            duk_unicode_is_letter(prev) && /* prev exists and is a letter */
            !duk_unicode_is_letter(next)) { /* next does not exist or next is not a letter */
            /* Capital sigma occurred at "end of word", lowercase to
             * U+03C2 = GREEK SMALL LETTER FINAL SIGMA.  Otherwise
             * fall through and let the normal rules lowercase it to
             * U+03C3 = GREEK SMALL LETTER SIGMA.
             */
            cp = 0x03c2L;
            goto singlechar;
        }

        /* XXX: lithuanian not implemented */
        /* XXX: lithuanian, explicit dot rules */
        /* XXX: turkish / azeri, lowercase rules */
    }

    /* 1:1 or special conversions, but not locale/context specific: script generated rules */
    duk_memzero(&bd_ctx, sizeof(bd_ctx));
    if (uppercase) {
        bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_uc;
        bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_uc);
    } else {
        bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_lc;
        bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc);
    }
    return duk__slow_case_conversion(thr, bw, cp, &bd_ctx);

singlechar:
    /* Result of a hardcoded rule: a single codepoint in 'cp'. */
    if (bw != NULL) {
        DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
    }
    return cp;

    /* unused now, not needed until Turkish/Azeri */
#if 0
nochar:
    return -1;
#endif
}

/*
 *  Replace valstack top with case converted version.
 *
 *  The input string is decoded one codepoint at a time while keeping a
 *  sliding (prev, curr, next) window so that context sensitive rules
 *  can inspect the neighbors of the codepoint being converted.
 */

DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_bool_t uppercase) {
    duk_hstring *h_input;
    duk_bufwriter_ctx bw_alloc;
    duk_bufwriter_ctx *bw;
    const duk_uint8_t *p, *p_start, *p_end;
    duk_codepoint_t prev, curr, next;

    h_input = duk_require_hstring(thr, -1); /* Accept symbols. */
    DUK_ASSERT(h_input != NULL);

    /* Output buffer is initially sized to the input byte length. */
    bw = &bw_alloc;
    DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));

    /* [ ... input buffer ] */

    p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
    p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
    p = p_start;

    /* -1 marks "no codepoint" in the window. */
    prev = -1;
    DUK_UNREF(prev);
    curr = -1;
    next = -1;
    for (;;) {
        /* Slide the window forward by one codepoint. */
        prev = curr;
        curr = next;
        next = -1;
        if (p < p_end) {
            next = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
        } else {
            /* end of input and last char has been processed */
            if (curr < 0) {
                break;
            }
        }

        /* on first round, skip */
        if (curr >= 0) {
            /* XXX: could add a fast path to process chunks of input codepoints,
             * but relative benefit would be quite small.
             */

            /* Ensure space for maximum multi-character result; estimate is overkill. */
            DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH);

            /* Return value is not needed here, output goes to 'bw'. */
            duk__case_transform_helper(thr, bw, (duk_codepoint_t) curr, prev, next, uppercase);
        }
    }

    DUK_BW_COMPACT(thr, bw);
    (void) duk_buffer_to_string(thr, -1); /* Safe, output is encoded. */
    /* NOTE(review): the original note here read "invalidates h_buf pointer";
     * no 'h_buf' variable exists in this function -- presumably it refers
     * to the buffer just converted to a string; confirm.
     */
    duk_remove_m2(thr);
}

#if defined(DUK_USE_REGEXP_SUPPORT)

/*
 *  Canonicalize() abstract operation needed for canonicalization of individual
 *  codepoints during regexp compilation and execution, see E5 Section 15.10.2.8.
 *  Note that codepoints are canonicalized one character at a time, so no context
 *  specific rules can apply.  Locale specific rules can apply, though.
 */

DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
#if defined(DUK_USE_REGEXP_CANON_WORKAROUND)
    /* Fast canonicalization lookup at the cost of 128kB footprint.
     * Only codepoints below 0x10000 are looked up; others map to
     * themselves.
     */
    DUK_ASSERT(cp >= 0);
    DUK_UNREF(thr);
    if (DUK_LIKELY(cp < 0x10000L)) {
        return (duk_codepoint_t) duk_unicode_re_canon_lookup[cp];
    }
    return cp;
#else /* DUK_USE_REGEXP_CANON_WORKAROUND */
    duk_codepoint_t y;

    y = duk__case_transform_helper(thr,
                                   NULL, /* NULL is allowed, no output */
                                   cp, /* curr char */
                                   -1, /* prev char */
                                   -1, /* next char */
                                   1); /* uppercase */

    if ((y < 0) || (cp >= 0x80 && y < 0x80)) {
        /* multiple codepoint conversion or non-ASCII mapped to ASCII
         * --> leave as is.
         */
        return cp;
    }

    return y;
#endif /* DUK_USE_REGEXP_CANON_WORKAROUND */
}

/*
 *  E5 Section 15.10.2.6 "IsWordChar" abstract operation.  Assume
 *  x < 0 for characters read outside the string.
 */

DUK_INTERNAL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t x) {
    /*
     *  Note: the description in E5 Section 15.10.2.6 has a typo, it
     *  contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
     */
    if ((x >= '0' && x <= '9') || (x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || (x == '_')) {
        return 1;
    }
    return 0;
}

/*
 *  Regexp range tables
 *
 *  Each table is a flat list of inclusive [start,end] codepoint pairs.
 */

/* exposed because lexer needs these too */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_digit[2] = {
    (duk_uint16_t) 0x0030UL,
    (duk_uint16_t) 0x0039UL,
};
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_white[22] = {
    (duk_uint16_t) 0x0009UL, (duk_uint16_t) 0x000DUL, (duk_uint16_t) 0x0020UL, (duk_uint16_t) 0x0020UL,
    (duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x1680UL, (duk_uint16_t) 0x1680UL,
    (duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x2000UL, (duk_uint16_t) 0x200AUL,
    (duk_uint16_t) 0x2028UL, (duk_uint16_t) 0x2029UL, (duk_uint16_t) 0x202FUL, (duk_uint16_t) 0x202FUL,
    (duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x3000UL, (duk_uint16_t) 0x3000UL,
    (duk_uint16_t) 0xFEFFUL, (duk_uint16_t) 0xFEFFUL,
};
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_wordchar[8] = {
    (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL, (duk_uint16_t) 0x0041UL, (duk_uint16_t) 0x005AUL,
    (duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x0061UL, (duk_uint16_t) 0x007AUL,
};
/* Complements of the three tables above within 0x0000 ... 0xFFFF. */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_digit[4] = {
    (duk_uint16_t) 0x0000UL,
    (duk_uint16_t) 0x002FUL,
    (duk_uint16_t) 0x003AUL,
    (duk_uint16_t) 0xFFFFUL,
};
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_white[24] = {
    (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x0008UL, (duk_uint16_t) 0x000EUL, (duk_uint16_t) 0x001FUL,
    (duk_uint16_t) 0x0021UL, (duk_uint16_t) 0x009FUL, (duk_uint16_t) 0x00A1UL, (duk_uint16_t) 0x167FUL,
    (duk_uint16_t) 0x1681UL, (duk_uint16_t) 0x180DUL, (duk_uint16_t) 0x180FUL, (duk_uint16_t) 0x1FFFUL,
    (duk_uint16_t) 0x200BUL, (duk_uint16_t) 0x2027UL, (duk_uint16_t) 0x202AUL, (duk_uint16_t) 0x202EUL,
    (duk_uint16_t) 0x2030UL, (duk_uint16_t) 0x205EUL, (duk_uint16_t) 0x2060UL, (duk_uint16_t) 0x2FFFUL,
    (duk_uint16_t) 0x3001UL, (duk_uint16_t) 0xFEFEUL, (duk_uint16_t) 0xFF00UL, (duk_uint16_t) 0xFFFFUL,
};
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = {
    (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL, (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0x0040UL,
    (duk_uint16_t) 0x005BUL, (duk_uint16_t) 0x005EUL, (duk_uint16_t) 0x0060UL, (duk_uint16_t) 0x0060UL,
    (duk_uint16_t) 0x007BUL, (duk_uint16_t) 0xFFFFUL,
};

#endif /* DUK_USE_REGEXP_SUPPORT */