#ifndef DARTS_H_ #define DARTS_H_ #include #include #include #define DARTS_VERSION "0.32" // DARTS_THROW() throws a whose message starts with the // file name and the line number. For example, DARTS_THROW("error message") at // line 123 of "darts.h" throws a which has a pointer to // "darts.h:123: exception: error message". The message is available by using // what() as well as that of . #define DARTS_INT_TO_STR(value) #value #define DARTS_LINE_TO_STR(line) DARTS_INT_TO_STR(line) #define DARTS_LINE_STR DARTS_LINE_TO_STR(__LINE__) #define DARTS_THROW(msg) throw Darts::Details::Exception( \ __FILE__ ":" DARTS_LINE_STR ": exception: " msg) namespace Darts { // The following namespace hides the internal types and classes. namespace Details { // This header assumes that and are 32-bit integer types. // // Darts-clone keeps values associated with keys. The type of the values is // . Note that the values must be positive integers because the // most significant bit (MSB) of each value is used to represent whether the // corresponding unit is a leaf or not. Also, the keys are represented by // sequences of s. is the unsigned type of . typedef char char_type; typedef unsigned char uchar_type; typedef int value_type; // The main structure of Darts-clone is an array of s, and the // unit type is actually a wrapper of . typedef unsigned int id_type; // is the type of callback functions for reporting the // progress of building a dictionary. See also build() of . // The 1st argument receives the progress value and the 2nd argument receives // the maximum progress value. A usage example is to show the progress // percentage, 100.0 * (the 1st argument) / (the 2nd argument). typedef int (*progress_func_type)(std::size_t, std::size_t); // is the type of double-array units and it is a wrapper of // in practice. class DoubleArrayUnit { public: DoubleArrayUnit() : unit_() {} // has_leaf() returns whether a leaf unit is immediately derived from the // unit (true) or not (false). bool has_leaf() const { return ((unit_ >> 8) & 1) == 1; } // value() returns the value stored in the unit, and thus value() is // available when and only when the unit is a leaf unit. value_type value() const { return static_cast(unit_ & ((1U << 31) - 1)); } // label() returns the label associted with the unit. Note that a leaf unit // always returns an invalid label. For this feature, leaf unit's label() // returns an that has the MSB of 1. id_type label() const { return unit_ & ((1U << 31) | 0xFF); } // offset() returns the offset from the unit to its derived units. id_type offset() const { return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6); } private: id_type unit_; // Copyable. }; // Darts-clone throws an for memory allocation failure, invalid // arguments or a too large offset. The last case means that there are too many // keys in the given set of keys. Note that the `msg' of must be a // constant or static string because an keeps only a pointer to // that string. class Exception : public std::exception { public: explicit Exception(const char *msg = NULL) throw() : msg_(msg) {} Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {} virtual ~Exception() throw() {} // overrides what() of . virtual const char *what() const throw() { return (msg_ != NULL) ? msg_ : ""; } private: const char *msg_; // Disallows operator=. Exception &operator=(const Exception &); }; } // namespace Details // is the interface of Darts-clone. Note that other // classes should not be accessed from outside. // // has 4 template arguments but only the 3rd one is used as // the type of values. Note that the given is used only from outside, and // the internal value type is not changed from . // In build(), given values are casted from to // by using static_cast. On the other hand, values are casted from // to in searching dictionaries. template class DoubleArrayImpl { public: // Even if this is changed, the internal value type is still // . Other types, such as 64-bit integer types // and floating-point number types, should not be used. typedef T value_type; // A key is reprenseted by a sequence of s. For example, // exactMatchSearch() takes a . typedef Details::char_type key_type; // In searching dictionaries, the values associated with the matched keys are // stored into or returned as s. typedef value_type result_type; // enables applications to get the lengths of the matched // keys in addition to the values. struct result_pair_type { value_type value; std::size_t length; }; // The constructor initializes member variables with 0 and NULLs. DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {} // The destructor frees memory allocated for units and then initializes // member variables with 0 and NULLs. virtual ~DoubleArrayImpl() { clear(); } // has 2 kinds of set_result()s. The 1st set_result() is to // set a value to a . The 2nd set_result() is to set a value and // a length to a . By using set_result()s, search methods // can return the 2 kinds of results in the same way. // Why the set_result()s are non-static? It is for compatibility. // // The 1st set_result() takes a length as the 3rd argument but it is not // used. If a compiler does a good job, codes for getting the length may be // removed. void set_result(value_type *result, value_type value, std::size_t) const { *result = value; } // The 2nd set_result() uses both `value' and `length'. void set_result(result_pair_type *result, value_type value, std::size_t length) const { result->value = value; result->length = length; } // set_array() calls clear() in order to free memory allocated to the old // array and then sets a new array. This function is useful to set a memory- // mapped array. Note that the array set by set_array() is not freed in // clear() and the destructor of . // set_array() can also set the size of the new array but the size is not // used in search methods. So it works well even if the 2nd argument is 0 or // omitted. Remember that size() and total_size() returns 0 in such a case. void set_array(const void *ptr, std::size_t size = 0) { clear(); array_ = static_cast(ptr); size_ = size; } // array() returns a pointer to the array of units. const void *array() const { return array_; } // clear() frees memory allocated to units and then initializes member // variables with 0 and NULLs. Note that clear() does not free memory if the // array of units was set by set_array(). In such a case, `array_' is not // NULL and `buf_' is NULL. void clear() { size_ = 0; array_ = NULL; if (buf_ != NULL) { delete[] buf_; buf_ = NULL; } } // unit_size() returns the size of each unit. The size must be 4 bytes. std::size_t unit_size() const { return sizeof(unit_type); } // size() returns the number of units. It can be 0 if set_array() is used. std::size_t size() const { return size_; } // total_size() returns the number of bytes allocated to the array of units. // It can be 0 if set_array() is used. std::size_t total_size() const { return unit_size() * size(); } // nonzero_size() exists for compatibility. It always returns the number of // units because it takes long time to count the number of non-zero units. std::size_t nonzero_size() const { return size(); } // build() constructs a dictionary from given key-value pairs. If `lengths' // is NULL, `keys' is handled as an array of zero-terminated strings. If // `values' is NULL, the index in `keys' is associated with each key, i.e. // the ith key has (i - 1) as its value. // Note that the key-value pairs must be arranged in key order and the values // must not be negative. Also, if there are duplicate keys, only the first // pair will be stored in the resultant dictionary. // `progress_func' is a pointer to a callback function. If it is not NULL, // it will be called in build() so that the caller can check the progress of // dictionary construction. For details, please see the definition of // . // The return value of build() is 0, and it indicates the success of the // operation. Otherwise, build() throws a , which is a // derived class of . // build() uses another construction algorithm if `values' is not NULL. In // this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead // of a trie because a DAWG is likely to be more compact than a trie. int build(std::size_t num_keys, const key_type * const *keys, const std::size_t *lengths = NULL, const value_type *values = NULL, Details::progress_func_type progress_func = NULL); // open() reads an array of units from the specified file. And if it goes // well, the old array will be freed and replaced with the new array read // from the file. `offset' specifies the number of bytes to be skipped before // reading an array. `size' specifies the number of bytes to be read from the // file. If the `size' is 0, the whole file will be read. // open() returns 0 iff the operation succeeds. Otherwise, it returns a // non-zero value or throws a . The exception is thrown // when and only when a memory allocation fails. int open(const char *file_name, const char *mode = "rb", std::size_t offset = 0, std::size_t size = 0); // save() writes the array of units into the specified file. `offset' // specifies the number of bytes to be skipped before writing the array. // open() returns 0 iff the operation succeeds. Otherwise, it returns a // non-zero value. int save(const char *file_name, const char *mode = "wb", std::size_t offset = 0) const; // The 1st exactMatchSearch() tests whether the given key exists or not, and // if it exists, its value and length are set to `result'. Otherwise, the // value and the length of `result' are set to -1 and 0 respectively. // Note that if `length' is 0, `key' is handled as a zero-terminated string. // `node_pos' specifies the start position of matching. This argument enables // the combination of exactMatchSearch() and traverse(). For example, if you // want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get // the node position corresponding to "xyz" and then you can use // exactMatchSearch() to test "A", "BC", and "DE" from that position. // Note that the length of `result' indicates the length from the `node_pos'. // In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }. template void exactMatchSearch(const key_type *key, U &result, std::size_t length = 0, std::size_t node_pos = 0) const { result = exactMatchSearch(key, length, node_pos); } // The 2nd exactMatchSearch() returns a result instead of updating the 2nd // argument. So, the following exactMatchSearch() has only 3 arguments. template inline U exactMatchSearch(const key_type *key, std::size_t length = 0, std::size_t node_pos = 0) const; // commonPrefixSearch() searches for keys which match a prefix of the given // string. If `length' is 0, `key' is handled as a zero-terminated string. // The values and the lengths of at most `max_num_results' matched keys are // stored in `results'. commonPrefixSearch() returns the number of matched // keys. Note that the return value can be larger than `max_num_results' if // there are more than `max_num_results' matches. If you want to get all the // results, allocate more spaces and call commonPrefixSearch() again. // `node_pos' works as well as in exactMatchSearch(). template inline std::size_t commonPrefixSearch(const key_type *key, U *results, std::size_t max_num_results, std::size_t length = 0, std::size_t node_pos = 0) const; // In Darts-clone, a dictionary is a deterministic finite-state automaton // (DFA) and traverse() tests transitions on the DFA. The initial state is // `node_pos' and traverse() chooses transitions labeled key[key_pos], // key[key_pos + 1], ... in order. If there is not a transition labeled // key[key_pos + i], traverse() terminates the transitions at that state and // returns -2. Otherwise, traverse() ends without a termination and returns // -1 or a nonnegative value, -1 indicates that the final state was not an // accept state. When a nonnegative value is returned, it is the value // associated with the final accept state. That is, traverse() returns the // value associated with the given key if it exists. Note that traverse() // updates `node_pos' and `key_pos' after each transition. inline value_type traverse(const key_type *key, std::size_t &node_pos, std::size_t &key_pos, std::size_t length = 0) const; private: typedef Details::uchar_type uchar_type; typedef Details::id_type id_type; typedef Details::DoubleArrayUnit unit_type; std::size_t size_; const unit_type *array_; unit_type *buf_; // Disallows copy and assignment. DoubleArrayImpl(const DoubleArrayImpl &); DoubleArrayImpl &operator=(const DoubleArrayImpl &); }; // is the typical instance of . It uses // as the type of values and it is suitable for most cases. typedef DoubleArrayImpl DoubleArray; // The interface section ends here. For using Darts-clone, there is no need // to read the remaining section, which gives the implementation of // Darts-clone. // // Member functions of DoubleArrayImpl (except build()). // template int DoubleArrayImpl::open(const char *file_name, const char *mode, std::size_t offset, std::size_t size) { #ifdef _MSC_VER std::FILE *file; if (::fopen_s(&file, file_name, mode) != 0) { return -1; } #else std::FILE *file = std::fopen(file_name, mode); if (file == NULL) { return -1; } #endif if (size == 0) { if (std::fseek(file, 0, SEEK_END) != 0) { std::fclose(file); return -1; } size = std::ftell(file) - offset; } size /= unit_size(); if (size < 256 || (size & 0xFF) != 0) { std::fclose(file); return -1; } if (std::fseek(file, offset, SEEK_SET) != 0) { std::fclose(file); return -1; } unit_type units[256]; if (std::fread(units, unit_size(), 256, file) != 256) { std::fclose(file); return -1; } if (units[0].label() != '\0' || units[0].has_leaf() || units[0].offset() == 0 || units[0].offset() >= 512) { std::fclose(file); return -1; } for (id_type i = 1; i < 256; ++i) { if (units[i].label() <= 0xFF && units[i].offset() >= size) { std::fclose(file); return -1; } } unit_type *buf; try { buf = new unit_type[size]; for (id_type i = 0; i < 256; ++i) { buf[i] = units[i]; } } catch (const std::bad_alloc &) { std::fclose(file); DARTS_THROW("failed to open double-array: std::bad_alloc"); } if (size > 256) { if (std::fread(buf + 256, unit_size(), size - 256, file) != size - 256) { std::fclose(file); delete[] buf; return -1; } } std::fclose(file); clear(); size_ = size; array_ = buf; buf_ = buf; return 0; } template int DoubleArrayImpl::save(const char *file_name, const char *mode, std::size_t) const { if (size() == 0) { return -1; } #ifdef _MSC_VER std::FILE *file; if (::fopen_s(&file, file_name, mode) != 0) { return -1; } #else std::FILE *file = std::fopen(file_name, mode); if (file == NULL) { return -1; } #endif if (std::fwrite(array_, unit_size(), size(), file) != size()) { std::fclose(file); return -1; } std::fclose(file); return 0; } template template inline U DoubleArrayImpl::exactMatchSearch(const key_type *key, std::size_t length, std::size_t node_pos) const { U result; set_result(&result, static_cast(-1), 0); unit_type unit = array_[node_pos]; if (length != 0) { for (std::size_t i = 0; i < length; ++i) { node_pos ^= unit.offset() ^ static_cast(key[i]); unit = array_[node_pos]; if (unit.label() != static_cast(key[i])) { return result; } } } else { for ( ; key[length] != '\0'; ++length) { node_pos ^= unit.offset() ^ static_cast(key[length]); unit = array_[node_pos]; if (unit.label() != static_cast(key[length])) { return result; } } } if (!unit.has_leaf()) { return result; } unit = array_[node_pos ^ unit.offset()]; set_result(&result, static_cast(unit.value()), length); return result; } template template inline std::size_t DoubleArrayImpl::commonPrefixSearch( const key_type *key, U *results, std::size_t max_num_results, std::size_t length, std::size_t node_pos) const { std::size_t num_results = 0; unit_type unit = array_[node_pos]; node_pos ^= unit.offset(); if (length != 0) { for (std::size_t i = 0; i < length; ++i) { node_pos ^= static_cast(key[i]); unit = array_[node_pos]; if (unit.label() != static_cast(key[i])) { return num_results; } node_pos ^= unit.offset(); if (unit.has_leaf()) { if (num_results < max_num_results) { set_result(&results[num_results], static_cast( array_[node_pos].value()), i + 1); } ++num_results; } } } else { for ( ; key[length] != '\0'; ++length) { node_pos ^= static_cast(key[length]); unit = array_[node_pos]; if (unit.label() != static_cast(key[length])) { return num_results; } node_pos ^= unit.offset(); if (unit.has_leaf()) { if (num_results < max_num_results) { set_result(&results[num_results], static_cast( array_[node_pos].value()), length + 1); } ++num_results; } } } return num_results; } template inline typename DoubleArrayImpl::value_type DoubleArrayImpl::traverse(const key_type *key, std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const { id_type id = static_cast(node_pos); unit_type unit = array_[id]; if (length != 0) { for ( ; key_pos < length; ++key_pos) { id ^= unit.offset() ^ static_cast(key[key_pos]); unit = array_[id]; if (unit.label() != static_cast(key[key_pos])) { return static_cast(-2); } node_pos = id; } } else { for ( ; key[key_pos] != '\0'; ++key_pos) { id ^= unit.offset() ^ static_cast(key[key_pos]); unit = array_[id]; if (unit.label() != static_cast(key[key_pos])) { return static_cast(-2); } node_pos = id; } } if (!unit.has_leaf()) { return static_cast(-1); } unit = array_[id ^ unit.offset()]; return static_cast(unit.value()); } namespace Details { // // Memory management of array. // template class AutoArray { public: explicit AutoArray(T *array = NULL) : array_(array) {} ~AutoArray() { clear(); } const T &operator[](std::size_t id) const { return array_[id]; } T &operator[](std::size_t id) { return array_[id]; } bool empty() const { return array_ == NULL; } void clear() { if (array_ != NULL) { delete[] array_; array_ = NULL; } } void swap(AutoArray *array) { T *temp = array_; array_ = array->array_; array->array_ = temp; } void reset(T *array = NULL) { AutoArray(array).swap(this); } private: T *array_; // Disallows copy and assignment. AutoArray(const AutoArray &); AutoArray &operator=(const AutoArray &); }; // // Memory management of resizable array. // template class AutoPool { public: AutoPool() : buf_(), size_(0), capacity_(0) {} ~AutoPool() { clear(); } const T &operator[](std::size_t id) const { return *(reinterpret_cast(&buf_[0]) + id); } T &operator[](std::size_t id) { return *(reinterpret_cast(&buf_[0]) + id); } bool empty() const { return size_ == 0; } std::size_t size() const { return size_; } void clear() { resize(0); buf_.clear(); size_ = 0; capacity_ = 0; } void push_back(const T &value) { append(value); } void pop_back() { (*this)[--size_].~T(); } void append() { if (size_ == capacity_) resize_buf(size_ + 1); new(&(*this)[size_++]) T; } void append(const T &value) { if (size_ == capacity_) resize_buf(size_ + 1); new(&(*this)[size_++]) T(value); } void resize(std::size_t size) { while (size_ > size) { (*this)[--size_].~T(); } if (size > capacity_) { resize_buf(size); } while (size_ < size) { new(&(*this)[size_++]) T; } } void resize(std::size_t size, const T &value) { while (size_ > size) { (*this)[--size_].~T(); } if (size > capacity_) { resize_buf(size); } while (size_ < size) { new(&(*this)[size_++]) T(value); } } void reserve(std::size_t size) { if (size > capacity_) { resize_buf(size); } } private: AutoArray buf_; std::size_t size_; std::size_t capacity_; // Disallows copy and assignment. AutoPool(const AutoPool &); AutoPool &operator=(const AutoPool &); void resize_buf(std::size_t size); }; template void AutoPool::resize_buf(std::size_t size) { std::size_t capacity; if (size >= capacity_ * 2) { capacity = size; } else { capacity = 1; while (capacity < size) { capacity <<= 1; } } AutoArray buf; try { buf.reset(new char[sizeof(T) * capacity]); } catch (const std::bad_alloc &) { DARTS_THROW("failed to resize pool: std::bad_alloc"); } if (size_ > 0) { T *src = reinterpret_cast(&buf_[0]); T *dest = reinterpret_cast(&buf[0]); for (std::size_t i = 0; i < size_; ++i) { new(&dest[i]) T(src[i]); src[i].~T(); } } buf_.swap(&buf); capacity_ = capacity; } // // Memory management of stack. // template class AutoStack { public: AutoStack() : pool_() {} ~AutoStack() { clear(); } const T &top() const { return pool_[size() - 1]; } T &top() { return pool_[size() - 1]; } bool empty() const { return pool_.empty(); } std::size_t size() const { return pool_.size(); } void push(const T &value) { pool_.push_back(value); } void pop() { pool_.pop_back(); } void clear() { pool_.clear(); } private: AutoPool pool_; // Disallows copy and assignment. AutoStack(const AutoStack &); AutoStack &operator=(const AutoStack &); }; // // Succinct bit vector. // class BitVector { public: BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {} ~BitVector() { clear(); } bool operator[](std::size_t id) const { return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1; } id_type rank(std::size_t id) const { std::size_t unit_id = id / UNIT_SIZE; return ranks_[unit_id] + pop_count(units_[unit_id] & (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1))); } void set(std::size_t id, bool bit) { if (bit) { units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE); } else { units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE)); } } bool empty() const { return units_.empty(); } std::size_t num_ones() const { return num_ones_; } std::size_t size() const { return size_; } void append() { if ((size_ % UNIT_SIZE) == 0) { units_.append(0); } ++size_; } void build(); void clear() { units_.clear(); ranks_.clear(); } private: enum { UNIT_SIZE = sizeof(id_type) * 8 }; AutoPool units_; AutoArray ranks_; std::size_t num_ones_; std::size_t size_; // Disallows copy and assignment. BitVector(const BitVector &); BitVector &operator=(const BitVector &); static id_type pop_count(id_type unit) { unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555); unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333); unit = ((unit >> 4) + unit) & 0x0F0F0F0F; unit += unit >> 8; unit += unit >> 16; return unit & 0xFF; } }; inline void BitVector::build() { try { ranks_.reset(new id_type[units_.size()]); } catch (const std::bad_alloc &) { DARTS_THROW("failed to build rank index: std::bad_alloc"); } num_ones_ = 0; for (std::size_t i = 0; i < units_.size(); ++i) { ranks_[i] = num_ones_; num_ones_ += pop_count(units_[i]); } } // // Keyset. // template class Keyset { public: Keyset(std::size_t num_keys, const char_type * const *keys, const std::size_t *lengths, const T *values) : num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {} std::size_t num_keys() const { return num_keys_; } const char_type *keys(std::size_t id) const { return keys_[id]; } uchar_type keys(std::size_t key_id, std::size_t char_id) const { if (has_lengths() && char_id >= lengths_[key_id]) return '\0'; return keys_[key_id][char_id]; } bool has_lengths() const { return lengths_ != NULL; } std::size_t lengths(std::size_t id) const { if (has_lengths()) { return lengths_[id]; } std::size_t length = 0; while (keys_[id][length] != '\0') { ++length; } return length; } bool has_values() const { return values_ != NULL; } const value_type values(std::size_t id) const { if (has_values()) { return static_cast(values_[id]); } return static_cast(id); } private: std::size_t num_keys_; const char_type * const * keys_; const std::size_t *lengths_; const T *values_; // Disallows copy and assignment. Keyset(const Keyset &); Keyset &operator=(const Keyset &); }; // // Node of Directed Acyclic Word Graph (DAWG). // class DawgNode { public: DawgNode() : child_(0), sibling_(0), label_('\0'), is_state_(false), has_sibling_(false) {} void set_child(id_type child) { child_ = child; } void set_sibling(id_type sibling) { sibling_ = sibling; } void set_value(value_type value) { child_ = value; } void set_label(uchar_type label) { label_ = label; } void set_is_state(bool is_state) { is_state_ = is_state; } void set_has_sibling(bool has_sibling) { has_sibling_ = has_sibling; } id_type child() const { return child_; } id_type sibling() const { return sibling_; } value_type value() const { return static_cast(child_); } uchar_type label() const { return label_; } bool is_state() const { return is_state_; } bool has_sibling() const { return has_sibling_; } id_type unit() const { if (label_ == '\0') { return (child_ << 1) | (has_sibling_ ? 1 : 0); } return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0); } private: id_type child_; id_type sibling_; uchar_type label_; bool is_state_; bool has_sibling_; // Copyable. }; // // Fixed unit of Directed Acyclic Word Graph (DAWG). // class DawgUnit { public: explicit DawgUnit(id_type unit = 0) : unit_(unit) {} DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {} DawgUnit &operator=(id_type unit) { unit_ = unit; return *this; } id_type unit() const { return unit_; } id_type child() const { return unit_ >> 2; } bool has_sibling() const { return (unit_ & 1) == 1; } value_type value() const { return static_cast(unit_ >> 1); } bool is_state() const { return (unit_ & 2) == 2; } private: id_type unit_; // Copyable. }; // // Directed Acyclic Word Graph (DAWG) builder. // class DawgBuilder { public: DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(), table_(), node_stack_(), recycle_bin_(), num_states_(0) {} ~DawgBuilder() { clear(); } id_type root() const { return 0; } id_type child(id_type id) const { return units_[id].child(); } id_type sibling(id_type id) const { return units_[id].has_sibling() ? (id + 1) : 0; } int value(id_type id) const { return units_[id].value(); } bool is_leaf(id_type id) const { return label(id) == '\0'; } uchar_type label(id_type id) const { return labels_[id]; } bool is_intersection(id_type id) const { return is_intersections_[id]; } id_type intersection_id(id_type id) const { return is_intersections_.rank(id) - 1; } std::size_t num_intersections() const { return is_intersections_.num_ones(); } std::size_t size() const { return units_.size(); } void init(); void finish(); void insert(const char *key, std::size_t length, value_type value); void clear(); private: enum { INITIAL_TABLE_SIZE = 1 << 10 }; AutoPool nodes_; AutoPool units_; AutoPool labels_; BitVector is_intersections_; AutoPool table_; AutoStack node_stack_; AutoStack recycle_bin_; std::size_t num_states_; // Disallows copy and assignment. DawgBuilder(const DawgBuilder &); DawgBuilder &operator=(const DawgBuilder &); void flush(id_type id); void expand_table(); id_type find_unit(id_type id, id_type *hash_id) const; id_type find_node(id_type node_id, id_type *hash_id) const; bool are_equal(id_type node_id, id_type unit_id) const; id_type hash_unit(id_type id) const; id_type hash_node(id_type id) const; id_type append_node(); id_type append_unit(); void free_node(id_type id) { recycle_bin_.push(id); } static id_type hash(id_type key) { key = ~key + (key << 15); // key = (key << 15) - key - 1; key = key ^ (key >> 12); key = key + (key << 2); key = key ^ (key >> 4); key = key * 2057; // key = (key + (key << 3)) + (key << 11); key = key ^ (key >> 16); return key; } }; inline void DawgBuilder::init() { table_.resize(INITIAL_TABLE_SIZE, 0); append_node(); append_unit(); num_states_ = 1; nodes_[0].set_label(0xFF); node_stack_.push(0); } inline void DawgBuilder::finish() { flush(0); units_[0] = nodes_[0].unit(); labels_[0] = nodes_[0].label(); nodes_.clear(); table_.clear(); node_stack_.clear(); recycle_bin_.clear(); is_intersections_.build(); } inline void DawgBuilder::insert(const char *key, std::size_t length, value_type value) { if (value < 0) { DARTS_THROW("failed to insert key: negative value"); } else if (length == 0) { DARTS_THROW("failed to insert key: zero-length key"); } id_type id = 0; std::size_t key_pos = 0; for ( ; key_pos <= length; ++key_pos) { id_type child_id = nodes_[id].child(); if (child_id == 0) { break; } uchar_type key_label = static_cast(key[key_pos]); if (key_pos < length && key_label == '\0') { DARTS_THROW("failed to insert key: invalid null character"); } uchar_type unit_label = nodes_[child_id].label(); if (key_label < unit_label) { DARTS_THROW("failed to insert key: wrong key order"); } else if (key_label > unit_label) { nodes_[child_id].set_has_sibling(true); flush(child_id); break; } id = child_id; } if (key_pos > length) { return; } for ( ; key_pos <= length; ++key_pos) { uchar_type key_label = static_cast( (key_pos < length) ? key[key_pos] : '\0'); id_type child_id = append_node(); if (nodes_[id].child() == 0) { nodes_[child_id].set_is_state(true); } nodes_[child_id].set_sibling(nodes_[id].child()); nodes_[child_id].set_label(key_label); nodes_[id].set_child(child_id); node_stack_.push(child_id); id = child_id; } nodes_[id].set_value(value); } inline void DawgBuilder::clear() { nodes_.clear(); units_.clear(); labels_.clear(); is_intersections_.clear(); table_.clear(); node_stack_.clear(); recycle_bin_.clear(); num_states_ = 0; } inline void DawgBuilder::flush(id_type id) { while (node_stack_.top() != id) { id_type node_id = node_stack_.top(); node_stack_.pop(); if (num_states_ >= table_.size() - (table_.size() >> 2)) { expand_table(); } id_type num_siblings = 0; for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { ++num_siblings; } id_type hash_id; id_type match_id = find_node(node_id, &hash_id); if (match_id != 0) { is_intersections_.set(match_id, true); } else { id_type unit_id = 0; for (id_type i = 0; i < num_siblings; ++i) { unit_id = append_unit(); } for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { units_[unit_id] = nodes_[i].unit(); labels_[unit_id] = nodes_[i].label(); --unit_id; } match_id = unit_id + 1; table_[hash_id] = match_id; ++num_states_; } for (id_type i = node_id, next; i != 0; i = next) { next = nodes_[i].sibling(); free_node(i); } nodes_[node_stack_.top()].set_child(match_id); } node_stack_.pop(); } inline void DawgBuilder::expand_table() { std::size_t table_size = table_.size() << 1; table_.clear(); table_.resize(table_size, 0); for (std::size_t i = 1; i < units_.size(); ++i) { id_type id = static_cast(i); if (labels_[id] == '\0' || units_[id].is_state()) { id_type hash_id; find_unit(id, &hash_id); table_[hash_id] = id; } } } inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const { *hash_id = hash_unit(id) % table_.size(); for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { id_type unit_id = table_[*hash_id]; if (unit_id == 0) { break; } // There must not be the same unit. } return 0; } inline id_type DawgBuilder::find_node(id_type node_id, id_type *hash_id) const { *hash_id = hash_node(node_id) % table_.size(); for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { id_type unit_id = table_[*hash_id]; if (unit_id == 0) { break; } if (are_equal(node_id, unit_id)) { return unit_id; } } return 0; } inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const { for (id_type i = nodes_[node_id].sibling(); i != 0; i = nodes_[i].sibling()) { if (units_[unit_id].has_sibling() == false) { return false; } ++unit_id; } if (units_[unit_id].has_sibling() == true) { return false; } for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) { if (nodes_[i].unit() != units_[unit_id].unit() || nodes_[i].label() != labels_[unit_id]) { return false; } } return true; } inline id_type DawgBuilder::hash_unit(id_type id) const { id_type hash_value = 0; for ( ; id != 0; ++id) { id_type unit = units_[id].unit(); uchar_type label = labels_[id]; hash_value ^= hash((label << 24) ^ unit); if (units_[id].has_sibling() == false) { break; } } return hash_value; } inline id_type DawgBuilder::hash_node(id_type id) const { id_type hash_value = 0; for ( ; id != 0; id = nodes_[id].sibling()) { id_type unit = nodes_[id].unit(); uchar_type label = nodes_[id].label(); hash_value ^= hash((label << 24) ^ unit); } return hash_value; } inline id_type DawgBuilder::append_unit() { is_intersections_.append(); units_.append(); labels_.append(); return static_cast(is_intersections_.size() - 1); } inline id_type DawgBuilder::append_node() { id_type id; if (recycle_bin_.empty()) { id = static_cast(nodes_.size()); nodes_.append(); } else { id = recycle_bin_.top(); nodes_[id] = DawgNode(); recycle_bin_.pop(); } return id; } // // Unit of double-array builder. // class DoubleArrayBuilderUnit { public: DoubleArrayBuilderUnit() : unit_(0) {} void set_has_leaf(bool has_leaf) { if (has_leaf) { unit_ |= 1U << 8; } else { unit_ &= ~(1U << 8); } } void set_value(value_type value) { unit_ = value | (1U << 31); } void set_label(uchar_type label) { unit_ = (unit_ & ~0xFFU) | label; } void set_offset(id_type offset) { if (offset >= 1U << 29) { DARTS_THROW("failed to modify unit: too large offset"); } unit_ &= (1U << 31) | (1U << 8) | 0xFF; if (offset < 1U << 21) { unit_ |= (offset << 10); } else { unit_ |= (offset << 2) | (1U << 9); } } private: id_type unit_; // Copyable. }; // // Extra unit of double-array builder. // class DoubleArrayBuilderExtraUnit { public: DoubleArrayBuilderExtraUnit() : prev_(0), next_(0), is_fixed_(false), is_used_(false) {} void set_prev(id_type prev) { prev_ = prev; } void set_next(id_type next) { next_ = next; } void set_is_fixed(bool is_fixed) { is_fixed_ = is_fixed; } void set_is_used(bool is_used) { is_used_ = is_used; } id_type prev() const { return prev_; } id_type next() const { return next_; } bool is_fixed() const { return is_fixed_; } bool is_used() const { return is_used_; } private: id_type prev_; id_type next_; bool is_fixed_; bool is_used_; // Copyable. }; // // DAWG -> double-array converter. // class DoubleArrayBuilder { public: explicit DoubleArrayBuilder(progress_func_type progress_func) : progress_func_(progress_func), units_(), extras_(), labels_(), table_(), extras_head_(0) {} ~DoubleArrayBuilder() { clear(); } template void build(const Keyset &keyset); void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const; void clear(); private: enum { BLOCK_SIZE = 256 }; enum { NUM_EXTRA_BLOCKS = 16 }; enum { NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS }; enum { UPPER_MASK = 0xFF << 21 }; enum { LOWER_MASK = 0xFF }; typedef DoubleArrayBuilderUnit unit_type; typedef DoubleArrayBuilderExtraUnit extra_type; progress_func_type progress_func_; AutoPool units_; AutoArray extras_; AutoPool labels_; AutoArray table_; id_type extras_head_; // Disallows copy and assignment. DoubleArrayBuilder(const DoubleArrayBuilder &); DoubleArrayBuilder &operator=(const DoubleArrayBuilder &); std::size_t num_blocks() const { return units_.size() / BLOCK_SIZE; } const extra_type &extras(id_type id) const { return extras_[id % NUM_EXTRAS]; } extra_type &extras(id_type id) { return extras_[id % NUM_EXTRAS]; } template void build_dawg(const Keyset &keyset, DawgBuilder *dawg_builder); void build_from_dawg(const DawgBuilder &dawg); void build_from_dawg(const DawgBuilder &dawg, id_type dawg_id, id_type dic_id); id_type arrange_from_dawg(const DawgBuilder &dawg, id_type dawg_id, id_type dic_id); template void build_from_keyset(const Keyset &keyset); template void build_from_keyset(const Keyset &keyset, std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id); template id_type arrange_from_keyset(const Keyset &keyset, std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id); id_type find_valid_offset(id_type id) const; bool is_valid_offset(id_type id, id_type offset) const; void reserve_id(id_type id); void expand_units(); void fix_all_blocks(); void fix_block(id_type block_id); }; template void DoubleArrayBuilder::build(const Keyset &keyset) { if (keyset.has_values()) { Details::DawgBuilder dawg_builder; build_dawg(keyset, &dawg_builder); build_from_dawg(dawg_builder); dawg_builder.clear(); } else { build_from_keyset(keyset); } } inline void DoubleArrayBuilder::copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const { if (size_ptr != NULL) { *size_ptr = units_.size(); } if (buf_ptr != NULL) { *buf_ptr = new DoubleArrayUnit[units_.size()]; unit_type *units = reinterpret_cast(*buf_ptr); for (std::size_t i = 0; i < units_.size(); ++i) { units[i] = units_[i]; } } } inline void DoubleArrayBuilder::clear() { units_.clear(); extras_.clear(); labels_.clear(); table_.clear(); extras_head_ = 0; } template void DoubleArrayBuilder::build_dawg(const Keyset &keyset, DawgBuilder *dawg_builder) { dawg_builder->init(); for (std::size_t i = 0; i < keyset.num_keys(); ++i) { dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i)); if (progress_func_ != NULL) { progress_func_(i + 1, keyset.num_keys() + 1); } } dawg_builder->finish(); } inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) { std::size_t num_units = 1; while (num_units < dawg.size()) { num_units <<= 1; } units_.reserve(num_units); table_.reset(new id_type[dawg.num_intersections()]); for (std::size_t i = 0; i < dawg.num_intersections(); ++i) { table_[i] = 0; } extras_.reset(new extra_type[NUM_EXTRAS]); reserve_id(0); extras(0).set_is_used(true); units_[0].set_offset(1); units_[0].set_label('\0'); if (dawg.child(dawg.root()) != 0) { build_from_dawg(dawg, dawg.root(), 0); } fix_all_blocks(); extras_.clear(); labels_.clear(); table_.clear(); } inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg, id_type dawg_id, id_type dic_id) { id_type dawg_child_id = dawg.child(dawg_id); if (dawg.is_intersection(dawg_child_id)) { id_type intersection_id = dawg.intersection_id(dawg_child_id); id_type offset = table_[intersection_id]; if (offset != 0) { offset ^= dic_id; if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { if (dawg.is_leaf(dawg_child_id)) { units_[dic_id].set_has_leaf(true); } units_[dic_id].set_offset(offset); return; } } } id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id); if (dawg.is_intersection(dawg_child_id)) { table_[dawg.intersection_id(dawg_child_id)] = offset; } do { uchar_type child_label = dawg.label(dawg_child_id); id_type dic_child_id = offset ^ child_label; if (child_label != '\0') { build_from_dawg(dawg, dawg_child_id, dic_child_id); } dawg_child_id = dawg.sibling(dawg_child_id); } while (dawg_child_id != 0); } inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg, id_type dawg_id, id_type dic_id) { labels_.resize(0); id_type dawg_child_id = dawg.child(dawg_id); while (dawg_child_id != 0) { labels_.append(dawg.label(dawg_child_id)); dawg_child_id = dawg.sibling(dawg_child_id); } id_type offset = find_valid_offset(dic_id); units_[dic_id].set_offset(dic_id ^ offset); dawg_child_id = dawg.child(dawg_id); for (std::size_t i = 0; i < labels_.size(); ++i) { id_type dic_child_id = offset ^ labels_[i]; reserve_id(dic_child_id); if (dawg.is_leaf(dawg_child_id)) { units_[dic_id].set_has_leaf(true); units_[dic_child_id].set_value(dawg.value(dawg_child_id)); } else { units_[dic_child_id].set_label(labels_[i]); } dawg_child_id = dawg.sibling(dawg_child_id); } extras(offset).set_is_used(true); return offset; } template void DoubleArrayBuilder::build_from_keyset(const Keyset &keyset) { std::size_t num_units = 1; while (num_units < keyset.num_keys()) { num_units <<= 1; } units_.reserve(num_units); extras_.reset(new extra_type[NUM_EXTRAS]); reserve_id(0); extras(0).set_is_used(true); units_[0].set_offset(1); units_[0].set_label('\0'); if (keyset.num_keys() > 0) { build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0); } fix_all_blocks(); extras_.clear(); labels_.clear(); } template void DoubleArrayBuilder::build_from_keyset(const Keyset &keyset, std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id); while (begin < end) { if (keyset.keys(begin, depth) != '\0') { break; } ++begin; } if (begin == end) { return; } std::size_t last_begin = begin; uchar_type last_label = keyset.keys(begin, depth); while (++begin < end) { uchar_type label = keyset.keys(begin, depth); if (label != last_label) { build_from_keyset(keyset, last_begin, begin, depth + 1, offset ^ last_label); last_begin = begin; last_label = keyset.keys(begin, depth); } } build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label); } template id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset &keyset, std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { labels_.resize(0); value_type value = -1; for (std::size_t i = begin; i < end; ++i) { uchar_type label = keyset.keys(i, depth); if (label == '\0') { if (keyset.has_lengths() && depth < keyset.lengths(i)) { DARTS_THROW("failed to build double-array: " "invalid null character"); } else if (keyset.values(i) < 0) { DARTS_THROW("failed to build double-array: negative value"); } if (value == -1) { value = keyset.values(i); } if (progress_func_ != NULL) { progress_func_(i + 1, keyset.num_keys() + 1); } } if (labels_.empty()) { labels_.append(label); } else if (label != labels_[labels_.size() - 1]) { if (label < labels_[labels_.size() - 1]) { DARTS_THROW("failed to build double-array: wrong key order"); } labels_.append(label); } } id_type offset = find_valid_offset(dic_id); units_[dic_id].set_offset(dic_id ^ offset); for (std::size_t i = 0; i < labels_.size(); ++i) { id_type dic_child_id = offset ^ labels_[i]; reserve_id(dic_child_id); if (labels_[i] == '\0') { units_[dic_id].set_has_leaf(true); units_[dic_child_id].set_value(value); } else { units_[dic_child_id].set_label(labels_[i]); } } extras(offset).set_is_used(true); return offset; } inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const { if (extras_head_ >= units_.size()) { return units_.size() | (id & LOWER_MASK); } id_type unfixed_id = extras_head_; do { id_type offset = unfixed_id ^ labels_[0]; if (is_valid_offset(id, offset)) { return offset; } unfixed_id = extras(unfixed_id).next(); } while (unfixed_id != extras_head_); return units_.size() | (id & LOWER_MASK); } inline bool DoubleArrayBuilder::is_valid_offset(id_type id, id_type offset) const { if (extras(offset).is_used()) { return false; } id_type rel_offset = id ^ offset; if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) { return false; } for (std::size_t i = 1; i < labels_.size(); ++i) { if (extras(offset ^ labels_[i]).is_fixed()) { return false; } } return true; } inline void DoubleArrayBuilder::reserve_id(id_type id) { if (id >= units_.size()) { expand_units(); } if (id == extras_head_) { extras_head_ = extras(id).next(); if (extras_head_ == id) { extras_head_ = units_.size(); } } extras(extras(id).prev()).set_next(extras(id).next()); extras(extras(id).next()).set_prev(extras(id).prev()); extras(id).set_is_fixed(true); } inline void DoubleArrayBuilder::expand_units() { id_type src_num_units = units_.size(); id_type src_num_blocks = num_blocks(); id_type dest_num_units = src_num_units + BLOCK_SIZE; id_type dest_num_blocks = src_num_blocks + 1; if (dest_num_blocks > NUM_EXTRA_BLOCKS) { fix_block(src_num_blocks - NUM_EXTRA_BLOCKS); } units_.resize(dest_num_units); if (dest_num_blocks > NUM_EXTRA_BLOCKS) { for (std::size_t id = src_num_units; id < dest_num_units; ++id) { extras(id).set_is_used(false); extras(id).set_is_fixed(false); } } for (id_type i = src_num_units + 1; i < dest_num_units; ++i) { extras(i - 1).set_next(i); extras(i).set_prev(i - 1); } extras(src_num_units).set_prev(dest_num_units - 1); extras(dest_num_units - 1).set_next(src_num_units); extras(src_num_units).set_prev(extras(extras_head_).prev()); extras(dest_num_units - 1).set_next(extras_head_); extras(extras(extras_head_).prev()).set_next(src_num_units); extras(extras_head_).set_prev(dest_num_units - 1); } inline void DoubleArrayBuilder::fix_all_blocks() { id_type begin = 0; if (num_blocks() > NUM_EXTRA_BLOCKS) { begin = num_blocks() - NUM_EXTRA_BLOCKS; } id_type end = num_blocks(); for (id_type block_id = begin; block_id != end; ++block_id) { fix_block(block_id); } } inline void DoubleArrayBuilder::fix_block(id_type block_id) { id_type begin = block_id * BLOCK_SIZE; id_type end = begin + BLOCK_SIZE; id_type unused_offset = 0; for (id_type offset = begin; offset != end; ++offset) { if (!extras(offset).is_used()) { unused_offset = offset; break; } } for (id_type id = begin; id != end; ++id) { if (!extras(id).is_fixed()) { reserve_id(id); units_[id].set_label(static_cast(id ^ unused_offset)); } } } } // namespace Details // // Member function build() of DoubleArrayImpl. // template int DoubleArrayImpl::build(std::size_t num_keys, const key_type * const *keys, const std::size_t *lengths, const value_type *values, Details::progress_func_type progress_func) { Details::Keyset keyset(num_keys, keys, lengths, values); Details::DoubleArrayBuilder builder(progress_func); builder.build(keyset); std::size_t size = 0; unit_type *buf = NULL; builder.copy(&size, &buf); clear(); size_ = size; array_ = buf; buf_ = buf; if (progress_func != NULL) { progress_func(num_keys + 1, num_keys + 1); } return 0; } } // namespace Darts #undef DARTS_INT_TO_STR #undef DARTS_LINE_TO_STR #undef DARTS_LINE_STR #undef DARTS_THROW #endif // DARTS_H_