// Copyright Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #ifndef encoding_rs_mem_h_ #define encoding_rs_mem_h_ #include #include #include /* * _Note:_ "Latin1" in this header refers to the Unicode range from U+0000 to * U+00FF, inclusive, and does not refer to the windows-1252 range. This * in-memory encoding is sometimes used as a storage optimization of text * when UTF-16 indexing and length semantics are exposed. */ /** * Classification of text as Latin1 (all code points are below U+0100), * left-to-right with some non-Latin1 characters or as containing at least * some right-to-left characters. */ typedef enum { /** * Every character is below U+0100. */ Latin1 = 0, /** * There is at least one character that's U+0100 or higher, but there * are no right-to-left characters. */ LeftToRight = 1, /** * There is at least one right-to-left character. */ Bidi = 2, } Latin1Bidi; #ifdef __cplusplus extern "C" { #endif // __cplusplus /** * Checks whether a valid UTF-8 buffer contains code points * that trigger right-to-left processing or is all-Latin1. * * Possibly more efficient than performing the checks separately. * * Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. * Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, * if `buffer` is `NULL`, or if the memory designated by `buffer` and * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` * may be bogus but still has to be non-`NULL`.) */ Latin1Bidi encoding_mem_check_str_for_latin1_and_bidi(const char* buffer, size_t len); /** * Checks whether a potentially invalid UTF-16 buffer contains code points * that trigger right-to-left processing or is all-Latin1. * * Possibly more efficient than performing the checks separately. * * Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL` and aligned.) */ Latin1Bidi encoding_mem_check_utf16_for_latin1_and_bidi(const char16_t* buffer, size_t len); /** * Checks whether a potentially invalid UTF-8 buffer contains code points * that trigger right-to-left processing or is all-Latin1. * * Possibly more efficient than performing the checks separately. * * Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. * * Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return * `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL`.) */ Latin1Bidi encoding_mem_check_utf8_for_latin1_and_bidi(const char* buffer, size_t len); /** * Converts bytes whose unsigned value is interpreted as Unicode code point * (i.e. U+0000 to U+00FF, inclusive) to UTF-16. * * The length of the destination buffer must be at least the length of the * source buffer. * * The number of `char16_t`s written equals the length of the source buffer. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ void encoding_mem_convert_latin1_to_utf16(const char* src, size_t src_len, char16_t* dst, size_t dst_len); /** * Converts bytes whose unsigned value is interpreted as Unicode code point * (i.e. U+0000 to U+00FF, inclusive) to UTF-8. * * The length of the destination buffer must be at least the length of the * source buffer times two. * * Returns the number of bytes written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Safety * * Note that this function may write garbage beyond the number of bytes * indicated by the return value. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_convert_latin1_to_utf8(const char* src, size_t src_len, char* dst, size_t dst_len); /** * Converts bytes whose unsigned value is interpreted as Unicode code point * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient * output space. * * Writes the number of code units read into `*src_len` and the number of * bytes written into `*dst_len`. * * If the output isn't large enough, not all input is consumed. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ void encoding_mem_convert_latin1_to_utf8_partial(const char* src, size_t* src_len, char* dst, size_t* dst_len); /** * Converts valid UTF-8 to valid UTF-16. * * The length of the destination buffer must be at least the length of the * source buffer. * * Returns the number of `char16_t`s written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL`, if the two memory blocks overlap, of if the * buffer designated by `src` and `src_len` does not contain valid UTF-8. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_convert_str_to_utf16(const char* src, size_t src_len, char16_t* dst, size_t dst_len); /** * If the input is valid UTF-16 representing only Unicode code points from * U+0000 to U+00FF, inclusive, converts the input into output that * represents the value of each code point as the unsigned byte value of * each output byte. * * If the input does not fulfill the condition stated above, does something * that is memory-safe without any promises about any properties of the * output and will probably assert in debug builds in future versions. * In particular, callers shouldn't assume the output to be the same across * crate versions or CPU architectures and should not assume that non-ASCII * input can't map to ASCII output. * * The length of the destination buffer must be at least the length of the * source buffer. * * The number of bytes written equals the length of the source buffer. * * # Panics * * Panics if the destination buffer is shorter than stated above. * (Probably in future versions if debug assertions are enabled (and not * fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, size_t src_len, char* dst, size_t dst_len); /** * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced * with the REPLACEMENT CHARACTER. * * The length of the destination buffer must be at least the length of the * source buffer times three. * * Returns the number of bytes written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len); /** * Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced * with the REPLACEMENT CHARACTER with potentially insufficient output * space. * * Writes the number of code units read into `*src_len` and the number of * bytes written into `*dst_len`. * * Guarantees that the bytes in the destination beyond the number of * bytes claimed as written by the second item of the return tuple * are left unmodified. * * Not all code units are read if there isn't enough output space. * Note that this method isn't designed for general streamability but for * not allocating memory for the worst case up front. Specifically, * if the input starts with or ends with an unpaired surrogate, those are * replaced with the REPLACEMENT CHARACTER. * * Matches the semantics of `TextEncoder.encodeInto()` from the * Encoding Standard. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ void encoding_mem_convert_utf16_to_utf8_partial(const char16_t* src, size_t* src_len, char* dst, size_t* dst_len); /** * If the input is valid UTF-8 representing only Unicode code points from * U+0000 to U+00FF, inclusive, converts the input into output that * represents the value of each code point as the unsigned byte value of * each output byte. * * If the input does not fulfill the condition stated above, this function * panics if debug assertions are enabled (and fuzzing isn't) and otherwise * does something that is memory-safe without any promises about any * properties of the output. In particular, callers shouldn't assume the * output to be the same across crate versions or CPU architectures and * should not assume that non-ASCII input can't map to ASCII output. * The length of the destination buffer must be at least the length of the * source buffer. * * Returns the number of bytes written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * If debug assertions are enabled (and not fuzzing) and the input is * not in the range U+0000 to U+00FF, inclusive. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src, size_t src_len, char* dst, size_t dst_len); /** * Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced * with the REPLACEMENT CHARACTER. * * The length of the destination buffer must be at least the length of the * source buffer _plus one_. * * Returns the number of `char16_t`s written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_convert_utf8_to_utf16(const char* src, size_t src_len, char16_t* dst, size_t dst_len); /** * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. * * The length of the destination buffer must be at least the length of the * source buffer. * * Returns the number of `char16_t`s written or `SIZE_MAX` if the input was * invalid. * * When the input was invalid, some output may have been written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_convert_utf8_to_utf16_without_replacement(const char* src, size_t src_len, char16_t* dst, size_t dst_len); /** * Copies ASCII from source to destination up to the first non-ASCII byte * (or the end of the input if it is ASCII in its entirety). * * The length of the destination buffer must be at least the length of the * source buffer. * * Returns the number of bytes written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_copy_ascii_to_ascii(const char* src, size_t src_len, char* dst, size_t dst_len); /** * Copies ASCII from source to destination zero-extending it to UTF-16 up to * the first non-ASCII byte (or the end of the input if it is ASCII in its * entirety). * * The length of the destination buffer must be at least the length of the * source buffer. * * Returns the number of `char16_t`s written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_copy_ascii_to_basic_latin(const char* src, size_t src_len, char16_t* dst, size_t dst_len); /** * Copies Basic Latin from source to destination narrowing it to ASCII up to * the first non-Basic Latin code unit (or the end of the input if it is * Basic Latin in its entirety). * * The length of the destination buffer must be at least the length of the * source buffer. * * Returns the number of bytes written. * * # Panics * * Panics if the destination buffer is shorter than stated above. * * # Undefined behavior * * UB ensues if `src` and `src_len` don't designate a valid memory block, if * `src` is `NULL`, if `dst` and `dst_len` don't designate a valid memory * block, if `dst` is `NULL` or if the two memory blocks overlap. (If * `src_len` is `0`, `src` may be bogus but still has to be non-`NULL` and * aligned. Likewise for `dst` and `dst_len`.) */ size_t encoding_mem_copy_basic_latin_to_ascii(const char16_t* src, size_t src_len, char* dst, size_t dst_len); /** * Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL` and aligned.) */ void encoding_mem_ensure_utf16_validity(char16_t* buffer, size_t len); /** * Checks whether the buffer is all-ASCII. * * May read the entire buffer even if it isn't all-ASCII. (I.e. the function * is not guaranteed to fail fast.) * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL`.) */ bool encoding_mem_is_ascii(const char* buffer, size_t len); /** * Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing * only ASCII characters). * * May read the entire buffer even if it isn't all-ASCII. (I.e. the function * is not guaranteed to fail fast.) * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL` and aligned.) */ bool encoding_mem_is_basic_latin(const char16_t* buffer, size_t len); /** * Checks whether a scalar value triggers right-to-left processing. * * The check is done on a Unicode block basis without regard to assigned * vs. unassigned code points in the block. Hebrew presentation forms in * the Alphabetic Presentation Forms block are treated as if they formed * a block on their own (i.e. it treated as right-to-left). Additionally, * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked * for. Control characters that are technically bidi controls but do not * cause right-to-left behavior without the presence of right-to-left * characters or right-to-left controls are not checked for. As a special * case, U+FEFF is excluded from Arabic Presentation Forms-B. * * # Undefined behavior * * Undefined behavior ensues if `c` is not a valid Unicode Scalar Value. */ bool encoding_mem_is_char_bidi(char32_t c); /** * Checks whether a valid UTF-8 buffer contains code points that trigger * right-to-left processing. * * The check is done on a Unicode block basis without regard to assigned * vs. unassigned code points in the block. Hebrew presentation forms in * the Alphabetic Presentation Forms block are treated as if they formed * a block on their own (i.e. it treated as right-to-left). Additionally, * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked * for. Control characters that are technically bidi controls but do not * cause right-to-left behavior without the presence of right-to-left * characters or right-to-left controls are not checked for. As a special * case, U+FEFF is excluded from Arabic Presentation Forms-B. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, * if `buffer` is `NULL`, or if the memory designated by `buffer` and * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` * may be bogus but still has to be non-`NULL`.) */ bool encoding_mem_is_str_bidi(const char* buffer, size_t len); /** * Checks whether the buffer represents only code points less than or equal * to U+00FF. * * Fails fast. (I.e. returns before having read the whole buffer if code * points above U+00FF are discovered. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, * if `buffer` is `NULL`, or if the memory designated by `buffer` and * `buffer_len` does not contain valid UTF-8. (If `buffer_len` is `0`, `buffer` * may be bogus but still has to be non-`NULL`.) */ bool encoding_mem_is_str_latin1(const char* buffer, size_t len); /** * Checks whether a UTF-16 buffer contains code points that trigger * right-to-left processing. * * The check is done on a Unicode block basis without regard to assigned * vs. unassigned code points in the block. Hebrew presentation forms in * the Alphabetic Presentation Forms block are treated as if they formed * a block on their own (i.e. it treated as right-to-left). Additionally, * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked * for. Control characters that are technically bidi controls but do not * cause right-to-left behavior without the presence of right-to-left * characters or right-to-left controls are not checked for. As a special * case, U+FEFF is excluded from Arabic Presentation Forms-B. * Returns `true` if the input contains an RTL character or an unpaired * high surrogate that could be the high half of an RTL character. * Returns `false` if the input contains neither RTL characters nor * unpaired high surrogates that could be higher halves of RTL characters. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL` and aligned.) */ bool encoding_mem_is_utf16_bidi(const char16_t* buffer, size_t len); /** * Checks whether a UTF-16 code unit triggers right-to-left processing. * * The check is done on a Unicode block basis without regard to assigned * vs. unassigned code points in the block. Hebrew presentation forms in * the Alphabetic Presentation Forms block are treated as if they formed * a block on their own (i.e. it treated as right-to-left). Additionally, * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked * for. Control characters that are technically bidi controls but do not * cause right-to-left behavior without the presence of right-to-left * characters or right-to-left controls are not checked for. As a special * case, U+FEFF is excluded from Arabic Presentation Forms-B. * Since supplementary-plane right-to-left blocks are identifiable from the * high surrogate without examining the low surrogate, this function returns * `true` for such high surrogates making the function suitable for handling * supplementary-plane text without decoding surrogate pairs to scalar * values. Obviously, such high surrogates are then reported as right-to-left * even if actually unpaired. */ bool encoding_mem_is_utf16_code_unit_bidi(char16_t u); /** * Checks whether the buffer represents only code point less than or equal * to U+00FF. * * May read the entire buffer even if it isn't all-Latin1. (I.e. the function * is not guaranteed to fail fast.) * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL` and aligned.) */ bool encoding_mem_is_utf16_latin1(const char16_t* buffer, size_t len); /** * Checks whether a potentially-invalid UTF-8 buffer contains code points * that trigger right-to-left processing. * * The check is done on a Unicode block basis without regard to assigned * vs. unassigned code points in the block. Hebrew presentation forms in * the Alphabetic Presentation Forms block are treated as if they formed * a block on their own (i.e. it treated as right-to-left). Additionally, * the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked * for. Control characters that are technically bidi controls but do not * cause right-to-left behavior without the presence of right-to-left * characters or right-to-left controls are not checked for. As a special * case, U+FEFF is excluded from Arabic Presentation Forms-B. * Returns `true` if the input is invalid UTF-8 or the input contains an * RTL character. Returns `false` if the input is valid UTF-8 and contains * no RTL characters. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL`.) */ bool encoding_mem_is_utf8_bidi(const char* buffer, size_t len); /** * Checks whether the buffer is valid UTF-8 representing only code points * less than or equal to U+00FF. * * Fails fast. (I.e. returns before having read the whole buffer if UTF-8 * invalidity or code points above U+00FF are discovered. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL`.) */ bool encoding_mem_is_utf8_latin1(const char* buffer, size_t len); /** * Returns the index of the first unpaired surrogate or, if the input is * valid UTF-16 in its entirety, the length of the input. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL` and aligned.) */ size_t encoding_mem_utf16_valid_up_to(const char16_t* buffer, size_t len); /** * Returns the index of first byte that starts an invalid byte * sequence or a non-Latin1 byte sequence, or the length of the * string if there are neither. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block * or if `buffer` is `NULL`. (If `buffer_len` is `0`, `buffer` may be bogus but * still has to be non-`NULL` and aligned.) */ size_t encoding_mem_utf8_latin1_up_to(const char* buffer, size_t len); /** * Returns the index of first byte that starts a non-Latin1 byte * sequence, or the length of the string if there are none. * * # Undefined behavior * * UB ensues if `buffer` and `buffer_len` don't designate a valid memory block, * if `buffer` is `NULL`, or if the memory block does not contain valid UTF-8. * (If `buffer_len` is `0`, `buffer` may be bogus but still has to be non-`NULL` * and aligned.) */ size_t encoding_mem_str_latin1_up_to(const char* buffer, size_t len); #ifdef __cplusplus } // extern "C" #endif // __cplusplus #endif // encoding_rs_mem_h_