/*
 * Copyright (C) 2020 Siara Logics (cc)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @author Arundale Ramanathan
 *
 */

/**
 * @file unishox2.h
 * @author Arundale Ramanathan, James Z. M. Gao
 * @brief API for Unishox2 Compression and Decompression
 *
 * This file describes each function of the Unishox2 API \n
 * For finding out how this API can be used in your program, \n
 * please see test_unishox2.c.
 */

#ifndef unishox2
#define unishox2

#define UNISHOX_VERSION "2.0" ///< Unicode spec version

/**
 * Macro switch to enable/disable output buffer length parameter in low level api \n
 * Disabled by default (this header sets it to 0 unless it was already defined) \n
 * When this macro is enabled, all the API functions \n
 * except the simple API functions accept an additional parameter olen \n
 * that enables the developer to pass the size of the output buffer provided \n
 * so that the api function may not write beyond that length. \n
 * This can be disabled if the developer knows that the buffer provided is sufficient \n
 * so no additional parameter is passed and the program is faster since additional check \n
 * for output length is not performed at each step \n
 * The simple api, i.e. unishox2_(de)compress_simple will always omit the buffer length
 */
#ifndef UNISHOX_API_WITH_OUTPUT_LEN
#  define UNISHOX_API_WITH_OUTPUT_LEN 0
#endif

/// Up to 8 bits of initial magic bit sequence can be included.
/// Bit count can be specified with UNISHOX_MAGIC_BIT_LEN
/// Bit pattern the magic bits are taken from; 0xFF unless defined before this point.
#ifndef UNISHOX_MAGIC_BITS
#  define UNISHOX_MAGIC_BITS 0xFF
#endif

/// Desired length of Magic bits defined by UNISHOX_MAGIC_BITS.
/// Values 0 through 8 (inclusive) are accepted; anything else stops the build.
/// Defaults to 1 when not defined before this point.
#ifdef UNISHOX_MAGIC_BIT_LEN
#  if UNISHOX_MAGIC_BIT_LEN < 0 || 9 <= UNISHOX_MAGIC_BIT_LEN
#    error "UNISHOX_MAGIC_BIT_LEN must be in the range [0, 8]"
#  endif
#else
#  define UNISHOX_MAGIC_BIT_LEN 1
#endif

// NOTE(review): presumably the index order of the five entries in each preset below - confirm against the implementation.
//enum {USX_ALPHA = 0, USX_SYM, USX_NUM, USX_DICT, USX_DELTA};

/// Default Horizontal codes. When composition of text is known beforehand, the other hcodes in this section can be used to achieve more compression.
#define USX_HCODES_DFLT (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0xE0}
/// Length of each default hcode
#define USX_HCODE_LENS_DFLT (const unsigned char[]) {2, 2, 2, 3, 3}

/// Horizontal codes preset for English Alphabet content only
#define USX_HCODES_ALPHA_ONLY (const unsigned char[]) {0x00, 0x00, 0x00, 0x00, 0x00}
/// Length of each Alpha only hcode
#define USX_HCODE_LENS_ALPHA_ONLY (const unsigned char[]) {0, 0, 0, 0, 0}

/// Horizontal codes preset for Alpha Numeric content only
#define USX_HCODES_ALPHA_NUM_ONLY (const unsigned char[]) {0x00, 0x00, 0x80, 0x00, 0x00}
/// Length of each Alpha numeric hcode
#define USX_HCODE_LENS_ALPHA_NUM_ONLY (const unsigned char[]) {1, 0, 1, 0, 0}

/// Horizontal codes preset for Alpha Numeric and Symbol content only
#define USX_HCODES_ALPHA_NUM_SYM_ONLY (const unsigned char[]) {0x00, 0x80, 0xC0, 0x00, 0x00}
/// Length of each Alpha numeric and symbol hcodes
#define USX_HCODE_LENS_ALPHA_NUM_SYM_ONLY (const unsigned char[]) {1, 2, 2, 0, 0}

/// Horizontal codes preset favouring Alphabet content
#define USX_HCODES_FAVOR_ALPHA (const unsigned char[]) {0x00, 0x80, 0xA0, 0xC0, 0xE0}
/// Length of each hcode favouring Alpha content
#define USX_HCODE_LENS_FAVOR_ALPHA (const unsigned char[]) {1, 3, 3, 3, 3}

/// Horizontal codes preset favouring repeating sequences
#define USX_HCODES_FAVOR_DICT (const unsigned char[]) {0x00, 0x40, 0xC0, 0x80, 0xE0}
/// Length of each hcode favouring repeating sequences
#define USX_HCODE_LENS_FAVOR_DICT (const unsigned char[]) {2, 2, 3, 2, 3}

/// Horizontal codes preset favouring symbols
#define USX_HCODES_FAVOR_SYM (const unsigned char[]) {0x80, 0x00, 0xA0, 0xC0, 0xE0}
/// Length of each hcode favouring symbols
#define USX_HCODE_LENS_FAVOR_SYM (const unsigned char[]) {3, 1, 3, 3, 3}

// Alternative umlaut preset, left disabled in favour of the definitions that follow.
//#define USX_HCODES_FAVOR_UMLAUT {0x00, 0x40, 0xE0, 0xC0, 0x80}
//#define USX_HCODE_LENS_FAVOR_UMLAUT {2, 2, 3, 3, 2}

/// Horizontal codes preset favouring umlaut letters
#define USX_HCODES_FAVOR_UMLAUT (const unsigned char[]) {0x80, 0xA0, 0xC0, 0xE0, 0x00}
/// Length of each hcode favouring umlaut letters
#define USX_HCODE_LENS_FAVOR_UMLAUT (const unsigned char[]) {3, 3, 3, 3, 1}

/// Horizontal codes preset for no repeating sequences
#define USX_HCODES_NO_DICT (const unsigned char[]) {0x00, 0x40, 0x80, 0x00, 0xC0}
/// Length of each hcode for no repeating sequences
#define USX_HCODE_LENS_NO_DICT (const unsigned char[]) {2, 2, 2, 0, 2}

/// Horizontal codes preset for no Unicode characters
#define USX_HCODES_NO_UNI (const unsigned char[]) {0x00, 0x40, 0x80, 0xC0, 0x00}
/// Length of each hcode for no Unicode characters
#define USX_HCODE_LENS_NO_UNI (const unsigned char[]) {2, 2, 2, 2, 0}

/// Default frequently occurring sequences. When composition of text is known beforehand, the other sequences in this section can be used to achieve more compression.
// NOTE(review): the third entry below is an empty string, which looks like lost text - verify against the upstream header.
#define USX_FREQ_SEQ_DFLT (const char *[]) {"\": \"", "\": ", "", "=\"", "\":\"", "://"}

/// Frequently occurring sequences in text content
#define USX_FREQ_SEQ_TXT (const char *[]) {" the ", " and ", "tion", " with", "ing", "ment"}

/// Frequently occurring sequences in URL content
#define USX_FREQ_SEQ_URL (const char *[]) {"https://", "www.", ".com", "http://", ".org", ".net"}

/// Frequently occurring sequences in JSON content
#define USX_FREQ_SEQ_JSON (const char *[]) {"\": \"", "\": ", "\",", "}}}", "\":\"", "}}"}

/// Frequently occurring sequences in HTML content
// NOTE(review): this initializer is split across physical lines in this copy (its last string literal is
// unterminated here and an entry is empty); it is kept verbatim - restore it from the upstream header before use.
#define USX_FREQ_SEQ_HTML (const char *[]) {"", "=\"", "div", "href", "class", "
"} /// Frequently occuring sequences in XML content #define USX_FREQ_SEQ_XML (const char *[]) {"", "=\"", "\">", "